From 97523945d107244cc575bb44d31ad5d7f98be28f Mon Sep 17 00:00:00 2001 From: Youmu Date: Mon, 16 Jan 2023 15:31:05 -0500 Subject: [PATCH 1/3] md5: Add inline assembly support for `x86` and `x86_64` guarded by feature flag `inline-asm` --- Cargo.lock | 7 ++ md5/Cargo.toml | 2 + md5/README.md | 2 + md5/src/asm/mod.rs | 12 ++ md5/src/asm/x86.rs | 280 +++++++++++++++++++++++++++++++++++++++++++++ md5/src/lib.rs | 21 +++- 6 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 md5/src/asm/mod.rs create mode 100644 md5/src/asm/x86.rs diff --git a/Cargo.lock b/Cargo.lock index d851c17b4..3cda52ecb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "asm_block" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab" + [[package]] name = "blake2" version = "0.10.6" @@ -156,6 +162,7 @@ checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" name = "md-5" version = "0.10.5" dependencies = [ + "asm_block", "digest", "hex-literal", "md5-asm", diff --git a/md5/Cargo.toml b/md5/Cargo.toml index 48fd5aff7..b27f8e7a9 100644 --- a/md5/Cargo.toml +++ b/md5/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.10.4" [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] md5-asm = { version = "0.5", optional = true } +asm_block = { version = "0.1.3", optional = true } [dev-dependencies] digest = { version = "0.10.4", features = ["dev"] } @@ -28,4 +29,5 @@ hex-literal = "0.2.2" default = ["std"] std = ["digest/std"] asm = ["md5-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates +inline-asm = ["asm_block"] # Enable inline assembly support. WARNING: Bumps MSRV to 1.59 oid = ["digest/oid"] # Enable OID support. 
WARNING: Bumps MSRV to 1.57 diff --git a/md5/README.md b/md5/README.md index 56af1749b..a162b608c 100644 --- a/md5/README.md +++ b/md5/README.md @@ -28,6 +28,8 @@ including HMAC-MD5. Rust **1.41** or higher. +Enabling feature flag `inline-asm` requires Rust **1.59** or higher. + Minimum supported Rust version can be changed in the future, but it will be done with a minor version bump. diff --git a/md5/src/asm/mod.rs b/md5/src/asm/mod.rs new file mode 100644 index 000000000..7f43d7d7a --- /dev/null +++ b/md5/src/asm/mod.rs @@ -0,0 +1,12 @@ +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod x86; + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub use x86::compress_block; + +#[inline] +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { + for block in blocks { + compress_block(state, block) + } +} diff --git a/md5/src/asm/x86.rs b/md5/src/asm/x86.rs new file mode 100644 index 000000000..f3387ad14 --- /dev/null +++ b/md5/src/asm/x86.rs @@ -0,0 +1,280 @@ +//! MD5 assembly code for `x86_64` and `x86`. Adapted from Project Nayuki. +/* + * MD5 hash in x86-64 assembly + * + * Copyright (c) 2016 Project Nayuki. (MIT License) + * https://www.nayuki.io/page/fast-md5-hash-implementation-in-x86-assembly + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * - The Software is provided "as is", without warranty of any kind, express or + * implied, including but not limited to the warranties of merchantability, + * fitness for a particular purpose and noninfringement. In no event shall the + * authors or copyright holders be liable for any claim, damages or other + * liability, whether in an action of contract, tort or otherwise, arising from, + * out of or in connection with the Software or the use or other dealings in the + * Software. + */ +use core::arch::asm; + +use asm_block::asm_block; + +/// MD5 operators +macro_rules! asm_md5_op { + (F, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $c; + add $a, $k; + xor $tmp1, $d; + and $tmp1, $b; + xor $tmp1, $d; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (G, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $d; + mov $tmp2, $d; + add $a, $k; + not $tmp1; + and $tmp2, $b; + and $tmp1, $c; + or $tmp1, $tmp2; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (H, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $c; + add $a, $k; + xor $tmp1, $d; + xor $tmp1, $b; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (I, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $d; + not $tmp1; + add $a, $k; + or $tmp1, $b; + xor $tmp1, $c; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (END, $a: tt, $b: tt, $s: literal, $t: literal, $tmp: tt) => { + asm_block! { + lea $a, [$a + $tmp + $t]; + rol $a, $s; + add $a, $b; + } + }; +} + +/// MD5 rounds, adding back the original value of states is omitted here +#[rustfmt::skip] +macro_rules! 
asm_md5 { + ( + // states + $a: tt, $b: tt, $c: tt, $d: tt, + // inputs + $x0: tt, $x1: tt, $x2: tt, $x3: tt, + $x4: tt, $x5: tt, $x6: tt, $x7: tt, + $x8: tt, $x9: tt, $xa: tt, $xb: tt, + $xc: tt, $xd: tt, $xe: tt, $xf: tt, + // clobbers + $t1: tt, $t2: tt + ) => { + concat!( + // round 1 + asm_md5_op!(F, $a, $b, $c, $d, $x0, 7, 0xd76aa478, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x1, 12, 0xe8c7b756, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $x2, 17, 0x242070db, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $x3, 22, 0xc1bdceee, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $x4, 7, 0xf57c0faf, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x5, 12, 0x4787c62a, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $x6, 17, 0xa8304613, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $x7, 22, 0xfd469501, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $x8, 7, 0x698098d8, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x9, 12, 0x8b44f7af, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $xa, 17, 0xffff5bb1, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $xb, 22, 0x895cd7be, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $xc, 7, 0x6b901122, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $xd, 12, 0xfd987193, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $xe, 17, 0xa679438e, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $xf, 22, 0x49b40821, $t1, $t2), + + // round 2 + asm_md5_op!(G, $a, $b, $c, $d, $x1, 5, 0xf61e2562, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $x6, 9, 0xc040b340, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $xb, 14, 0x265e5a51, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x0, 20, 0xe9b6c7aa, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $x5, 5, 0xd62f105d, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $xa, 9, 0x02441453, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $xf, 14, 0xd8a1e681, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x4, 20, 0xe7d3fbc8, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $x9, 5, 0x21e1cde6, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $xe, 9, 0xc33707d6, $t1, $t2), + 
asm_md5_op!(G, $c, $d, $a, $b, $x3, 14, 0xf4d50d87, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x8, 20, 0x455a14ed, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $xd, 5, 0xa9e3e905, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $x2, 9, 0xfcefa3f8, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $x7, 14, 0x676f02d9, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $xc, 20, 0x8d2a4c8a, $t1, $t2), + + // round 3 + asm_md5_op!(H, $a, $b, $c, $d, $x5, 4, 0xfffa3942, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x8, 11, 0x8771f681, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $xb, 16, 0x6d9d6122, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $xe, 23, 0xfde5380c, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $x1, 4, 0xa4beea44, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x4, 11, 0x4bdecfa9, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $x7, 16, 0xf6bb4b60, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $xa, 23, 0xbebfbc70, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $xd, 4, 0x289b7ec6, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x0, 11, 0xeaa127fa, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $x3, 16, 0xd4ef3085, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $x6, 23, 0x04881d05, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $x9, 4, 0xd9d4d039, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $xc, 11, 0xe6db99e5, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $xf, 16, 0x1fa27cf8, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $x2, 23, 0xc4ac5665, $t1, $t2), + + // round 4 + asm_md5_op!(I, $a, $b, $c, $d, $x0, 6, 0xf4292244, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $x7, 10, 0x432aff97, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $xe, 15, 0xab9423a7, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x5, 21, 0xfc93a039, $t1, $t2), + + asm_md5_op!(I, $a, $b, $c, $d, $xc, 6, 0x655b59c3, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $x3, 10, 0x8f0ccc92, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $xa, 15, 0xffeff47d, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x1, 21, 0x85845dd1, $t1, $t2), + + asm_md5_op!(I, 
$a, $b, $c, $d, $x8, 6, 0x6fa87e4f, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $xf, 10, 0xfe2ce6e0, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $x6, 15, 0xa3014314, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $xd, 21, 0x4e0811a1, $t1, $t2), + + asm_md5_op!(I, $a, $b, $c, $d, $x4, 6, 0xf7537e82, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $xb, 10, 0xbd3af235, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $x2, 15, 0x2ad7d2bb, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x9, 21, 0xeb86d391, $t1, $t2), + ) + }; +} + +/// MD5 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. But there are enough +/// registers to to house states, block address, and clobbers (7 in total), so we +/// can use automatical register allocation. +#[cfg(target_arch = "x86_64")] +pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { + let mut temp_state: [u32; 4] = *state; + + // SAFETY: inline-assembly + unsafe { + asm!( + asm_md5!( + // states + {a:e}, {b:e}, {c:e}, {d:e}, + // inputs + [{x} + 0], [{x} + 4], [{x} + 8], [{x} + 12], + [{x} + 16], [{x} + 20], [{x} + 24], [{x} + 28], + [{x} + 32], [{x} + 36], [{x} + 40], [{x} + 44], + [{x} + 48], [{x} + 52], [{x} + 56], [{x} + 60], + // clobbers + {t1:e}, {t2:e} + ), + + // states + a = inout(reg) temp_state[0], + b = inout(reg) temp_state[1], + c = inout(reg) temp_state[2], + d = inout(reg) temp_state[3], + // inputs + x = in(reg) block.as_ptr(), + // clobbers + t1 = out(reg) _, + t2 = out(reg) _, + ); + } + + // update states + state[0] = state[0].wrapping_add(temp_state[0]); + state[1] = state[1].wrapping_add(temp_state[1]); + state[2] = state[2].wrapping_add(temp_state[2]); + state[3] = state[3].wrapping_add(temp_state[3]); +} + +/// MD5 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. 
Due to possible failure +/// of register allocation on `x86`, we explicitly specify registers to use. +#[cfg(target_arch = "x86")] +pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { + let mut temp_state: [u32; 4] = *state; + + // SAFETY: inline-assembly + unsafe { + asm!( + // Save esi and ebp + "sub esp, 8", + "mov [esp + 0], esi", + "mov [esp + 4], ebp", + + asm_md5!( + // states + eax, ebx, ecx, edx, + // inputs + [edi + 0], [edi + 4], [edi + 8], [edi + 12], + [edi + 16], [edi + 20], [edi + 24], [edi + 28], + [edi + 32], [edi + 36], [edi + 40], [edi + 44], + [edi + 48], [edi + 52], [edi + 56], [edi + 60], + // clobbers + esi, ebp + ), + + // Restore esi and ebp + "mov ebp, [esp + 4]", + "mov esi, [esp + 0]", + "add esp, 8", + + // states + inout("eax") temp_state[0], + inout("ebx") temp_state[1], + inout("ecx") temp_state[2], + inout("edx") temp_state[3], + // inputs + in("edi") block.as_ptr(), + ); + } + + // update states + state[0] = state[0].wrapping_add(temp_state[0]); + state[1] = state[1].wrapping_add(temp_state[1]); + state[2] = state[2].wrapping_add(temp_state[2]); + state[3] = state[3].wrapping_add(temp_state[3]); +} diff --git a/md5/src/lib.rs b/md5/src/lib.rs index 87fe9134f..47599d0bf 100644 --- a/md5/src/lib.rs +++ b/md5/src/lib.rs @@ -33,13 +33,32 @@ #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] extern crate md5_asm as compress; -#[cfg(not(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64"))))] +#[cfg(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +))] +mod asm; + +#[cfg(not(all( + any(feature = "asm", feature = "inline-asm"), + any(target_arch = "x86", target_arch = "x86_64") +)))] mod compress; pub use digest::{self, Digest}; +#[cfg(not(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +)))] use compress::compress; +#[cfg(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +))] +use 
asm::compress; + use core::{fmt, slice::from_ref}; #[cfg(feature = "oid")] use digest::const_oid::{AssociatedOid, ObjectIdentifier}; From 40a315dea9d5619e664c3b70749d0af692bca8b8 Mon Sep 17 00:00:00 2001 From: Youmu Date: Mon, 16 Jan 2023 16:46:49 -0500 Subject: [PATCH 2/3] md5: add CI to test inline assembly --- .github/workflows/md5.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/md5.yml b/.github/workflows/md5.yml index 681453d47..31253571c 100644 --- a/.github/workflows/md5.yml +++ b/.github/workflows/md5.yml @@ -83,3 +83,20 @@ jobs: toolchain: ${{ matrix.rust }} override: true - run: cargo test --features oid + + # TODO: merge with test on MSRV bump to 1.59 or higher + test-inline-asm: + runs-on: ubuntu-latest + strategy: + matrix: + rust: + - 1.59.0 # MSRV + steps: + - uses: actions/checkout@v3 + - uses: RustCrypto/actions/cargo-cache@master + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + - run: cargo test --features inline-asm \ No newline at end of file From 8a4067091a7ddf1eb6e55ebb9f42b54aa1efb566 Mon Sep 17 00:00:00 2001 From: Youmu Date: Sun, 22 Jan 2023 15:59:26 -0500 Subject: [PATCH 3/3] md5: move block looping inside inline assembly --- md5/src/asm/mod.rs | 9 +-- md5/src/asm/x86.rs | 150 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 116 insertions(+), 43 deletions(-) diff --git a/md5/src/asm/mod.rs b/md5/src/asm/mod.rs index 7f43d7d7a..981aa85cb 100644 --- a/md5/src/asm/mod.rs +++ b/md5/src/asm/mod.rs @@ -2,11 +2,4 @@ mod x86; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub use x86::compress_block; - -#[inline] -pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { - for block in blocks { - compress_block(state, block) - } -} +pub use x86::compress; diff --git a/md5/src/asm/x86.rs b/md5/src/asm/x86.rs index f3387ad14..59b185075 100644 --- a/md5/src/asm/x86.rs +++ b/md5/src/asm/x86.rs @@ -189,15 +189,24 @@ 
macro_rules! asm_md5 { /// MD5 compress function. We don't have enough registers to load the whole block, /// so we need to use memory address to refer to the inputs. But there are enough -/// registers to to house states, block address, and clobbers (7 in total), so we +/// registers to house states, block address, and clobbers (12 in total), so we /// can use automatical register allocation. #[cfg(target_arch = "x86_64")] -pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { - let mut temp_state: [u32; 4] = *state; - +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( + // exit if no block + "cmp {cnt}, 0", + "jz 3f", + + "2:", + // duplicate state vector for this iteration + "mov {a:e}, {sa:e}", + "mov {b:e}, {sb:e}", + "mov {c:e}, {sc:e}", + "mov {d:e}, {sd:e}", + asm_md5!( // states {a:e}, {b:e}, {c:e}, {d:e}, @@ -210,41 +219,81 @@ pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { {t1:e}, {t2:e} ), - // states - a = inout(reg) temp_state[0], - b = inout(reg) temp_state[1], - c = inout(reg) temp_state[2], - d = inout(reg) temp_state[3], + // update state + "add {sa:e}, {a:e}", + "add {sb:e}, {b:e}", + "add {sc:e}, {c:e}", + "add {sd:e}, {d:e}", + + // check end of loop? 
+ "dec {cnt}", + "jz 3f", + + // advance block pointer + // 4 * 16 = 64 bytes + "add {x}, 64", + + "jmp 2b", + + // exit + "3:", + + // states clobbers + a = out(reg) _, + b = out(reg) _, + c = out(reg) _, + d = out(reg) _, + // output states + sa = inout(reg) state[0], + sb = inout(reg) state[1], + sc = inout(reg) state[2], + sd = inout(reg) state[3], // inputs - x = in(reg) block.as_ptr(), + // `x` and `cnt` are advanced/decremented by the asm loop, so they must be + // declared inout (with discarded outputs), not `in`: modifying an input-only + // register is undefined behavior under Rust's inline-asm rules. + x = inout(reg) blocks.as_ptr() => _, + cnt = inout(reg) blocks.len() => _, // clobbers t1 = out(reg) _, t2 = out(reg) _, ); } - - // update states - state[0] = state[0].wrapping_add(temp_state[0]); - state[1] = state[1].wrapping_add(temp_state[1]); - state[2] = state[2].wrapping_add(temp_state[2]); - state[3] = state[3].wrapping_add(temp_state[3]); } /// MD5 compress function. We don't have enough registers to load the whole block, /// so we need to use memory address to refer to the inputs. Due to possible failure /// of register allocation on `x86`, we explicitly specify registers to use. #[cfg(target_arch = "x86")] -pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { - let mut temp_state: [u32; 4] = *state; - +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( - // Save esi and ebp - "sub esp, 8", + // exit if no block + "cmp ebx, 0", + "jz 4f", + + // save esi and ebp + // save state vector address + // move block count to stack + "sub esp, 32", "mov [esp + 0], esi", "mov [esp + 4], ebp", + // address of `state` + "mov [esp + 8], eax", + // block count + "mov [esp + 12], ebx", + + // we can now use all registers + // we will move eax into ebp, save states on stack and set eax-edx as states + "mov ebp, eax", + "mov eax, [ebp + 0]", + "mov [esp + 16], eax", + "mov ebx, [ebp + 4]", + "mov [esp + 20], ebx", + "mov ecx, [ebp + 8]", + "mov [esp + 24], ecx", + "mov edx, [ebp + 12]", + "mov [esp + 28], edx", + "2:", asm_md5!( // states eax, ebx, ecx, edx, @@ -257,24 +306,55 @@ pub fn compress_block(state: &mut [u32; 4], block: &[u8; 
64]) { esi, ebp ), - // Restore esi and ebp - "mov ebp, [esp + 4]", + // update state + "add eax, [esp + 16]", + "add ebx, [esp + 20]", + "add ecx, [esp + 24]", + "add edx, [esp + 28]", + + // check end of loop? + "mov esi, [esp + 12]", + "dec esi", + "jz 3f", + + // save current state to stack + "mov [esp + 16], eax", + "mov [esp + 20], ebx", + "mov [esp + 24], ecx", + "mov [esp + 28], edx", + "mov [esp + 12], esi", + + // advance block pointer + // 4 * 16 = 64 bytes + "add edi, 64", + + "jmp 2b", + + "3:", + // write to state vector + "mov ebp, [esp + 8]", + "mov [ebp + 0], eax", + "mov [ebp + 4], ebx", + "mov [ebp + 8], ecx", + "mov [ebp + 12], edx", + + // restore esi and ebp "mov esi, [esp + 0]", - "add esp, 8", + "mov ebp, [esp + 4]", + "add esp, 32", + + // exit + "4:", // states - inout("eax") temp_state[0], - inout("ebx") temp_state[1], - inout("ecx") temp_state[2], - inout("edx") temp_state[3], + inout("eax") state.as_mut_ptr() => _, // inputs - in("edi") block.as_ptr(), + inout("edi") blocks.as_ptr() => _, + inout("ebx") blocks.len() => _, + + // clobbers + out("ecx") _, + out("edx") _, ); } - - // update states - state[0] = state[0].wrapping_add(temp_state[0]); - state[1] = state[1].wrapping_add(temp_state[1]); - state[2] = state[2].wrapping_add(temp_state[2]); - state[3] = state[3].wrapping_add(temp_state[3]); }