From 97523945d107244cc575bb44d31ad5d7f98be28f Mon Sep 17 00:00:00 2001 From: Youmu Date: Mon, 16 Jan 2023 15:31:05 -0500 Subject: [PATCH 1/3] md5: Add inline assembly support for `x86` and `x86_64` guarded by feature flag `inline-asm` --- Cargo.lock | 7 ++ md5/Cargo.toml | 2 + md5/README.md | 2 + md5/src/asm/mod.rs | 12 ++ md5/src/asm/x86.rs | 280 +++++++++++++++++++++++++++++++++++++++++++++ md5/src/lib.rs | 21 +++- 6 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 md5/src/asm/mod.rs create mode 100644 md5/src/asm/x86.rs diff --git a/Cargo.lock b/Cargo.lock index d851c17b4..3cda52ecb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "asm_block" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab" + [[package]] name = "blake2" version = "0.10.6" @@ -156,6 +162,7 @@ checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" name = "md-5" version = "0.10.5" dependencies = [ + "asm_block", "digest", "hex-literal", "md5-asm", diff --git a/md5/Cargo.toml b/md5/Cargo.toml index 48fd5aff7..b27f8e7a9 100644 --- a/md5/Cargo.toml +++ b/md5/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.10.4" [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] md5-asm = { version = "0.5", optional = true } +asm_block = { version = "0.1.3", optional = true } [dev-dependencies] digest = { version = "0.10.4", features = ["dev"] } @@ -28,4 +29,5 @@ hex-literal = "0.2.2" default = ["std"] std = ["digest/std"] asm = ["md5-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates +inline-asm = ["asm_block"] # Enable inline assembly support. WARNING: Bumps MSRV to 1.59 oid = ["digest/oid"] # Enable OID support. 
WARNING: Bumps MSRV to 1.57 diff --git a/md5/README.md b/md5/README.md index 56af1749b..a162b608c 100644 --- a/md5/README.md +++ b/md5/README.md @@ -28,6 +28,8 @@ including HMAC-MD5. Rust **1.41** or higher. +Enabling feature flag `inline-asm` requires Rust **1.59** or higher. + Minimum supported Rust version can be changed in the future, but it will be done with a minor version bump. diff --git a/md5/src/asm/mod.rs b/md5/src/asm/mod.rs new file mode 100644 index 000000000..7f43d7d7a --- /dev/null +++ b/md5/src/asm/mod.rs @@ -0,0 +1,12 @@ +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod x86; + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub use x86::compress_block; + +#[inline] +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { + for block in blocks { + compress_block(state, block) + } +} diff --git a/md5/src/asm/x86.rs b/md5/src/asm/x86.rs new file mode 100644 index 000000000..f3387ad14 --- /dev/null +++ b/md5/src/asm/x86.rs @@ -0,0 +1,280 @@ +//! MD5 assembly code for `x86_64` and `x86`. Adapted from Project Nayuki. +/* + * MD5 hash in x86-64 assembly + * + * Copyright (c) 2016 Project Nayuki. (MIT License) + * https://www.nayuki.io/page/fast-md5-hash-implementation-in-x86-assembly + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * - The Software is provided "as is", without warranty of any kind, express or + * implied, including but not limited to the warranties of merchantability, + * fitness for a particular purpose and noninfringement. In no event shall the + * authors or copyright holders be liable for any claim, damages or other + * liability, whether in an action of contract, tort or otherwise, arising from, + * out of or in connection with the Software or the use or other dealings in the + * Software. + */ +use core::arch::asm; + +use asm_block::asm_block; + +/// MD5 operators +macro_rules! asm_md5_op { + (F, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $c; + add $a, $k; + xor $tmp1, $d; + and $tmp1, $b; + xor $tmp1, $d; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (G, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $d; + mov $tmp2, $d; + add $a, $k; + not $tmp1; + and $tmp2, $b; + and $tmp1, $c; + or $tmp1, $tmp2; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (H, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $c; + add $a, $k; + xor $tmp1, $d; + xor $tmp1, $b; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (I, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + concat!( + asm_block! { + mov $tmp1, $d; + not $tmp1; + add $a, $k; + or $tmp1, $b; + xor $tmp1, $c; + }, + asm_md5_op!(END, $a, $b, $s, $t, $tmp1) + ) + }; + (END, $a: tt, $b: tt, $s: literal, $t: literal, $tmp: tt) => { + asm_block! { + lea $a, [$a + $tmp + $t]; + rol $a, $s; + add $a, $b; + } + }; +} + +/// MD5 rounds, adding back the original value of states is omitted here +#[rustfmt::skip] +macro_rules! 
asm_md5 { + ( + // states + $a: tt, $b: tt, $c: tt, $d: tt, + // inputs + $x0: tt, $x1: tt, $x2: tt, $x3: tt, + $x4: tt, $x5: tt, $x6: tt, $x7: tt, + $x8: tt, $x9: tt, $xa: tt, $xb: tt, + $xc: tt, $xd: tt, $xe: tt, $xf: tt, + // clobbers + $t1: tt, $t2: tt + ) => { + concat!( + // round 1 + asm_md5_op!(F, $a, $b, $c, $d, $x0, 7, 0xd76aa478, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x1, 12, 0xe8c7b756, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $x2, 17, 0x242070db, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $x3, 22, 0xc1bdceee, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $x4, 7, 0xf57c0faf, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x5, 12, 0x4787c62a, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $x6, 17, 0xa8304613, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $x7, 22, 0xfd469501, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $x8, 7, 0x698098d8, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $x9, 12, 0x8b44f7af, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $xa, 17, 0xffff5bb1, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $xb, 22, 0x895cd7be, $t1, $t2), + + asm_md5_op!(F, $a, $b, $c, $d, $xc, 7, 0x6b901122, $t1, $t2), + asm_md5_op!(F, $d, $a, $b, $c, $xd, 12, 0xfd987193, $t1, $t2), + asm_md5_op!(F, $c, $d, $a, $b, $xe, 17, 0xa679438e, $t1, $t2), + asm_md5_op!(F, $b, $c, $d, $a, $xf, 22, 0x49b40821, $t1, $t2), + + // round 2 + asm_md5_op!(G, $a, $b, $c, $d, $x1, 5, 0xf61e2562, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $x6, 9, 0xc040b340, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $xb, 14, 0x265e5a51, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x0, 20, 0xe9b6c7aa, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $x5, 5, 0xd62f105d, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $xa, 9, 0x02441453, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $xf, 14, 0xd8a1e681, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x4, 20, 0xe7d3fbc8, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $x9, 5, 0x21e1cde6, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $xe, 9, 0xc33707d6, $t1, $t2), + 
asm_md5_op!(G, $c, $d, $a, $b, $x3, 14, 0xf4d50d87, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $x8, 20, 0x455a14ed, $t1, $t2), + + asm_md5_op!(G, $a, $b, $c, $d, $xd, 5, 0xa9e3e905, $t1, $t2), + asm_md5_op!(G, $d, $a, $b, $c, $x2, 9, 0xfcefa3f8, $t1, $t2), + asm_md5_op!(G, $c, $d, $a, $b, $x7, 14, 0x676f02d9, $t1, $t2), + asm_md5_op!(G, $b, $c, $d, $a, $xc, 20, 0x8d2a4c8a, $t1, $t2), + + // round 3 + asm_md5_op!(H, $a, $b, $c, $d, $x5, 4, 0xfffa3942, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x8, 11, 0x8771f681, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $xb, 16, 0x6d9d6122, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $xe, 23, 0xfde5380c, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $x1, 4, 0xa4beea44, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x4, 11, 0x4bdecfa9, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $x7, 16, 0xf6bb4b60, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $xa, 23, 0xbebfbc70, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $xd, 4, 0x289b7ec6, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $x0, 11, 0xeaa127fa, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $x3, 16, 0xd4ef3085, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $x6, 23, 0x04881d05, $t1, $t2), + + asm_md5_op!(H, $a, $b, $c, $d, $x9, 4, 0xd9d4d039, $t1, $t2), + asm_md5_op!(H, $d, $a, $b, $c, $xc, 11, 0xe6db99e5, $t1, $t2), + asm_md5_op!(H, $c, $d, $a, $b, $xf, 16, 0x1fa27cf8, $t1, $t2), + asm_md5_op!(H, $b, $c, $d, $a, $x2, 23, 0xc4ac5665, $t1, $t2), + + // round 4 + asm_md5_op!(I, $a, $b, $c, $d, $x0, 6, 0xf4292244, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $x7, 10, 0x432aff97, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $xe, 15, 0xab9423a7, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x5, 21, 0xfc93a039, $t1, $t2), + + asm_md5_op!(I, $a, $b, $c, $d, $xc, 6, 0x655b59c3, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $x3, 10, 0x8f0ccc92, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $xa, 15, 0xffeff47d, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x1, 21, 0x85845dd1, $t1, $t2), + + asm_md5_op!(I, 
$a, $b, $c, $d, $x8, 6, 0x6fa87e4f, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $xf, 10, 0xfe2ce6e0, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $x6, 15, 0xa3014314, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $xd, 21, 0x4e0811a1, $t1, $t2), + + asm_md5_op!(I, $a, $b, $c, $d, $x4, 6, 0xf7537e82, $t1, $t2), + asm_md5_op!(I, $d, $a, $b, $c, $xb, 10, 0xbd3af235, $t1, $t2), + asm_md5_op!(I, $c, $d, $a, $b, $x2, 15, 0x2ad7d2bb, $t1, $t2), + asm_md5_op!(I, $b, $c, $d, $a, $x9, 21, 0xeb86d391, $t1, $t2), + ) + }; +} + +/// MD5 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. But there are enough +/// registers to to house states, block address, and clobbers (7 in total), so we +/// can use automatical register allocation. +#[cfg(target_arch = "x86_64")] +pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { + let mut temp_state: [u32; 4] = *state; + + // SAFETY: inline-assembly + unsafe { + asm!( + asm_md5!( + // states + {a:e}, {b:e}, {c:e}, {d:e}, + // inputs + [{x} + 0], [{x} + 4], [{x} + 8], [{x} + 12], + [{x} + 16], [{x} + 20], [{x} + 24], [{x} + 28], + [{x} + 32], [{x} + 36], [{x} + 40], [{x} + 44], + [{x} + 48], [{x} + 52], [{x} + 56], [{x} + 60], + // clobbers + {t1:e}, {t2:e} + ), + + // states + a = inout(reg) temp_state[0], + b = inout(reg) temp_state[1], + c = inout(reg) temp_state[2], + d = inout(reg) temp_state[3], + // inputs + x = in(reg) block.as_ptr(), + // clobbers + t1 = out(reg) _, + t2 = out(reg) _, + ); + } + + // update states + state[0] = state[0].wrapping_add(temp_state[0]); + state[1] = state[1].wrapping_add(temp_state[1]); + state[2] = state[2].wrapping_add(temp_state[2]); + state[3] = state[3].wrapping_add(temp_state[3]); +} + +/// MD5 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. 
Due to possible failure +/// of register allocation on `x86`, we explicitly specify registers to use. +#[cfg(target_arch = "x86")] +pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { + let mut temp_state: [u32; 4] = *state; + + // SAFETY: inline-assembly + unsafe { + asm!( + // Save esi and ebp + "sub esp, 8", + "mov [esp + 0], esi", + "mov [esp + 4], ebp", + + asm_md5!( + // states + eax, ebx, ecx, edx, + // inputs + [edi + 0], [edi + 4], [edi + 8], [edi + 12], + [edi + 16], [edi + 20], [edi + 24], [edi + 28], + [edi + 32], [edi + 36], [edi + 40], [edi + 44], + [edi + 48], [edi + 52], [edi + 56], [edi + 60], + // clobbers + esi, ebp + ), + + // Restore esi and ebp + "mov ebp, [esp + 4]", + "mov esi, [esp + 0]", + "add esp, 8", + + // states + inout("eax") temp_state[0], + inout("ebx") temp_state[1], + inout("ecx") temp_state[2], + inout("edx") temp_state[3], + // inputs + in("edi") block.as_ptr(), + ); + } + + // update states + state[0] = state[0].wrapping_add(temp_state[0]); + state[1] = state[1].wrapping_add(temp_state[1]); + state[2] = state[2].wrapping_add(temp_state[2]); + state[3] = state[3].wrapping_add(temp_state[3]); +} diff --git a/md5/src/lib.rs b/md5/src/lib.rs index 87fe9134f..47599d0bf 100644 --- a/md5/src/lib.rs +++ b/md5/src/lib.rs @@ -33,13 +33,32 @@ #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] extern crate md5_asm as compress; -#[cfg(not(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64"))))] +#[cfg(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +))] +mod asm; + +#[cfg(not(all( + any(feature = "asm", feature = "inline-asm"), + any(target_arch = "x86", target_arch = "x86_64") +)))] mod compress; pub use digest::{self, Digest}; +#[cfg(not(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +)))] use compress::compress; +#[cfg(all( + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64") +))] +use 
asm::compress; + use core::{fmt, slice::from_ref}; #[cfg(feature = "oid")] use digest::const_oid::{AssociatedOid, ObjectIdentifier}; From 40a315dea9d5619e664c3b70749d0af692bca8b8 Mon Sep 17 00:00:00 2001 From: Youmu Date: Mon, 16 Jan 2023 16:46:49 -0500 Subject: [PATCH 2/3] md5: add CI to test inline assembly --- .github/workflows/md5.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/md5.yml b/.github/workflows/md5.yml index 681453d47..31253571c 100644 --- a/.github/workflows/md5.yml +++ b/.github/workflows/md5.yml @@ -83,3 +83,20 @@ jobs: toolchain: ${{ matrix.rust }} override: true - run: cargo test --features oid + + # TODO: merge with test on MSRV bump to 1.59 or higher + test-inline-asm: + runs-on: ubuntu-latest + strategy: + matrix: + rust: + - 1.59.0 # MSRV + steps: + - uses: actions/checkout@v3 + - uses: RustCrypto/actions/cargo-cache@master + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + - run: cargo test --features inline-asm \ No newline at end of file From 8a4067091a7ddf1eb6e55ebb9f42b54aa1efb566 Mon Sep 17 00:00:00 2001 From: Youmu Date: Sun, 22 Jan 2023 15:59:26 -0500 Subject: [PATCH 3/3] md5: move block looping inside inline assembly --- md5/src/asm/mod.rs | 9 +-- md5/src/asm/x86.rs | 150 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 116 insertions(+), 43 deletions(-) diff --git a/md5/src/asm/mod.rs b/md5/src/asm/mod.rs index 7f43d7d7a..981aa85cb 100644 --- a/md5/src/asm/mod.rs +++ b/md5/src/asm/mod.rs @@ -2,11 +2,4 @@ mod x86; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub use x86::compress_block; - -#[inline] -pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { - for block in blocks { - compress_block(state, block) - } -} +pub use x86::compress; diff --git a/md5/src/asm/x86.rs b/md5/src/asm/x86.rs index f3387ad14..59b185075 100644 --- a/md5/src/asm/x86.rs +++ b/md5/src/asm/x86.rs @@ -189,15 +189,24 @@ 
macro_rules! asm_md5 { /// MD5 compress function. We don't have enough registers to load the whole block, /// so we need to use memory address to refer to the inputs. But there are enough -/// registers to to house states, block address, and clobbers (7 in total), so we +/// registers to house states, block address, and clobbers (12 in total), so we /// can use automatical register allocation. #[cfg(target_arch = "x86_64")] -pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { - let mut temp_state: [u32; 4] = *state; - +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( + // exit if no block + "cmp {cnt}, 0", + "jz 3f", + + "2:", + // duplicate state vector for this iteration + "mov {a:e}, {sa:e}", + "mov {b:e}, {sb:e}", + "mov {c:e}, {sc:e}", + "mov {d:e}, {sd:e}", + asm_md5!( // states {a:e}, {b:e}, {c:e}, {d:e}, @@ -210,41 +219,81 @@ pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { {t1:e}, {t2:e} ), - // states - a = inout(reg) temp_state[0], - b = inout(reg) temp_state[1], - c = inout(reg) temp_state[2], - d = inout(reg) temp_state[3], + // update state + "add {sa:e}, {a:e}", + "add {sb:e}, {b:e}", + "add {sc:e}, {c:e}", + "add {sd:e}, {d:e}", + + // check end of loop? 
+ "dec {cnt}", + "jz 3f", + + // advance block pointer + // 4 * 16 = 64 bytes + "add {x}, 64", + + "jmp 2b", + + // exit + "3:", + + // states clobbers + a = out(reg) _, + b = out(reg) _, + c = out(reg) _, + d = out(reg) _, + // output states + sa = inout(reg) state[0], + sb = inout(reg) state[1], + sc = inout(reg) state[2], + sd = inout(reg) state[3], // inputs - x = in(reg) block.as_ptr(), + // `x` and `cnt` are advanced/decremented by the asm loop, so they must be + // declared inout (with discarded outputs), not `in`: modifying an input-only + // register is undefined behavior under Rust's inline-asm rules. + x = inout(reg) blocks.as_ptr() => _, + cnt = inout(reg) blocks.len() => _, // clobbers t1 = out(reg) _, t2 = out(reg) _, ); } - - // update states - state[0] = state[0].wrapping_add(temp_state[0]); - state[1] = state[1].wrapping_add(temp_state[1]); - state[2] = state[2].wrapping_add(temp_state[2]); - state[3] = state[3].wrapping_add(temp_state[3]); } /// MD5 compress function. We don't have enough registers to load the whole block, /// so we need to use memory address to refer to the inputs. Due to possible failure /// of register allocation on `x86`, we explicitly specify registers to use. #[cfg(target_arch = "x86")] -pub fn compress_block(state: &mut [u32; 4], block: &[u8; 64]) { - let mut temp_state: [u32; 4] = *state; - +pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( - // Save esi and ebp - "sub esp, 8", + // exit if no block + "cmp ebx, 0", + "jz 4f", + + // save esi and ebp + // save state vector address + // move block count to stack + "sub esp, 32", "mov [esp + 0], esi", "mov [esp + 4], ebp", + // address of `state` + "mov [esp + 8], eax", + // block count + "mov [esp + 12], ebx", + + // we can now use all registers + // we will move eax into ebp, save states on stack and set eax-edx as states + "mov ebp, eax", + "mov eax, [ebp + 0]", + "mov [esp + 16], eax", + "mov ebx, [ebp + 4]", + "mov [esp + 20], ebx", + "mov ecx, [ebp + 8]", + "mov [esp + 24], ecx", + "mov edx, [ebp + 12]", + "mov [esp + 28], edx", + "2:", asm_md5!( // states eax, ebx, ecx, edx, @@ -257,24 +306,55 @@ pub fn compress_block(state: &mut [u32; 4], block: &[u8; 
64]) { esi, ebp ), - // Restore esi and ebp - "mov ebp, [esp + 4]", + // update state + "add eax, [esp + 16]", + "add ebx, [esp + 20]", + "add ecx, [esp + 24]", + "add edx, [esp + 28]", + + // check end of loop? + "mov esi, [esp + 12]", + "dec esi", + "jz 3f", + + // save current state to stack + "mov [esp + 16], eax", + "mov [esp + 20], ebx", + "mov [esp + 24], ecx", + "mov [esp + 28], edx", + "mov [esp + 12], esi", + + // advance block pointer + // 4 * 16 = 64 bytes + "add edi, 64", + + "jmp 2b", + + "3:", + // write to state vector + "mov ebp, [esp + 8]", + "mov [ebp + 0], eax", + "mov [ebp + 4], ebx", + "mov [ebp + 8], ecx", + "mov [ebp + 12], edx", + + // restore esi and ebp "mov esi, [esp + 0]", - "add esp, 8", + "mov ebp, [esp + 4]", + "add esp, 32", + + // exit + "4:", // states - inout("eax") temp_state[0], - inout("ebx") temp_state[1], - inout("ecx") temp_state[2], - inout("edx") temp_state[3], + inout("eax") state.as_mut_ptr() => _, // inputs - in("edi") block.as_ptr(), + inout("edi") blocks.as_ptr() => _, + inout("ebx") blocks.len() => _, + + // clobbers + out("ecx") _, + out("edx") _, ); } - - // update states - state[0] = state[0].wrapping_add(temp_state[0]); - state[1] = state[1].wrapping_add(temp_state[1]); - state[2] = state[2].wrapping_add(temp_state[2]); - state[3] = state[3].wrapping_add(temp_state[3]); }