diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml
index e7e3fed0..18c4afe6 100644
--- a/.github/workflows/aes.yml
+++ b/.github/workflows/aes.yml
@@ -39,7 +39,6 @@ jobs:
       - run: cargo build --release --target ${{ matrix.target }}
       - run: cargo build --release --target ${{ matrix.target }} --features compact
       - run: cargo build --release --target ${{ matrix.target }} --features ctr
-      - run: cargo build --release --target ${{ matrix.target }} --features force-soft
       - run: cargo build --release --target ${{ matrix.target }} --all-features
 
   # Tests for the portable software backend
@@ -74,7 +73,6 @@ jobs:
       - run: cargo test --release --target ${{ matrix.target }}
       - run: cargo test --release --target ${{ matrix.target }} --features compact
       - run: cargo test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
 
   # Tests for the AES-NI backend
@@ -113,7 +111,6 @@ jobs:
       - run: cargo test --release --target ${{ matrix.target }}
       - run: cargo test --release --target ${{ matrix.target }} --features compact
       - run: cargo test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
 
   # Cross-compiled tests
@@ -147,5 +144,4 @@ jobs:
       - run: cross test --release --target ${{ matrix.target }}
       - run: cross test --release --target ${{ matrix.target }} --features compact
       - run: cross test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
diff --git a/aes/Cargo.toml b/aes/Cargo.toml
index a85585dd..036c44c5 100644
--- a/aes/Cargo.toml
+++ b/aes/Cargo.toml
@@ -27,8 +27,7 @@ cipher = { version = "=0.3.0-pre", features = ["dev"] }
 cpuid-bool = "0.2"
 
 [features]
-compact = [] # Reduce code size at the cost of performance
-force-soft = [] # Disable support for AES hardware intrinsics
+compact = [] # Reduce code size at the cost of slower performance
 
 [package.metadata.docs.rs]
 features = ["ctr"]
diff --git a/aes/src/autodetect.rs b/aes/src/autodetect.rs
index 4ef9789f..8ed55c9f 100644
--- a/aes/src/autodetect.rs
+++ b/aes/src/autodetect.rs
@@ -130,6 +130,7 @@ pub(crate) mod ctr {
     }
 
     mod $module {
+        #[allow(clippy::large_enum_variant)]
        pub(super) enum Inner {
            Ni(crate::ni::$name),
            Soft(crate::soft::$name),
diff --git a/aes/src/lib.rs b/aes/src/lib.rs
index 68a951d6..72c19a9f 100644
--- a/aes/src/lib.rs
+++ b/aes/src/lib.rs
@@ -62,10 +62,7 @@ use cfg_if::cfg_if;
 
 cfg_if! {
-    if #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        not(feature = "force-soft")
-    ))] {
+    if #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] {
         mod autodetect;
         mod ni;
         mod soft;
diff --git a/aes/src/ni/aes128.rs b/aes/src/ni/aes128.rs
index 6f37cd53..f6f3fcfe 100644
--- a/aes/src/ni/aes128.rs
+++ b/aes/src/ni/aes128.rs
@@ -112,6 +112,7 @@ impl BlockDecrypt for Aes128 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[10]);
             b = _mm_aesdec_si128(b, keys[9]);
             b = _mm_aesdec_si128(b, keys[8]);
@@ -123,6 +124,9 @@ impl BlockDecrypt for Aes128 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/ni/aes192.rs b/aes/src/ni/aes192.rs
index 9cd4c345..4e3ac724 100644
--- a/aes/src/ni/aes192.rs
+++ b/aes/src/ni/aes192.rs
@@ -114,6 +114,7 @@ impl BlockDecrypt for Aes192 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[12]);
             b = _mm_aesdec_si128(b, keys[11]);
             b = _mm_aesdec_si128(b, keys[10]);
@@ -127,6 +128,9 @@ impl BlockDecrypt for Aes192 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/ni/aes256.rs b/aes/src/ni/aes256.rs
index 17d47a46..4e1a70fc 100644
--- a/aes/src/ni/aes256.rs
+++ b/aes/src/ni/aes256.rs
@@ -118,6 +118,7 @@ impl BlockDecrypt for Aes256 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[14]);
             b = _mm_aesdec_si128(b, keys[13]);
             b = _mm_aesdec_si128(b, keys[12]);
@@ -133,6 +134,9 @@ impl BlockDecrypt for Aes256 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/soft/fixslice64.rs b/aes/src/soft/fixslice64.rs
index f4e57fc7..802081e4 100644
--- a/aes/src/soft/fixslice64.rs
+++ b/aes/src/soft/fixslice64.rs
@@ -20,27 +20,28 @@ use cipher::{
     consts::{U16, U24, U32},
     generic_array::GenericArray,
 };
+use core::convert::TryInto;
 
 /// AES block batch size for this implementation
-pub(crate) const FIXSLICE_BLOCKS: usize = 4;
+pub(crate) const FIXSLICE_BLOCKS: usize = 2;
 
 /// AES-128 round keys
-pub(crate) type FixsliceKeys128 = [u64; 88];
+pub(crate) type FixsliceKeys128 = [u32; 88];
 
 /// AES-192 round keys
-pub(crate) type FixsliceKeys192 = [u64; 104];
+pub(crate) type FixsliceKeys192 = [u32; 104];
 
 /// AES-256 round keys
-pub(crate) type FixsliceKeys256 = [u64; 120];
+pub(crate) type FixsliceKeys256 = [u32; 120];
 
-/// 512-bit internal state
-type State = [u64; 8];
+/// 256-bit internal state
+type State = [u32; 8];
 
 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
-    let mut rkeys = [0u64; 88];
+    let mut rkeys = [0u32; 88];
 
-    bitslice(&mut rkeys[..8], key, key, key, key);
+    bitslice(&mut rkeys[..8], key, key);
 
     let mut rk_off = 0;
     for rcon in 0..10 {
@@ -89,25 +90,19 @@ pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys12
 
 /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
-    let mut rkeys = [0u64; 104];
-    let mut tmp = [0u64; 8];
-
-    bitslice(
-        &mut rkeys[..8],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-    );
-    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);
+    let mut rkeys = [0u32; 104];
+    let mut tmp = [0u32; 8];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut tmp, &key[8..], &key[8..]);
 
     let mut rcon = 0;
     let mut rk_off = 8;
 
     loop {
         for i in 0..8 {
-            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
-                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
+            rkeys[rk_off + i] =
+                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
         }
 
         sub_bytes(&mut tmp);
@@ -118,8 +113,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let mut ti = rkeys[rk_off + i];
-            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
-            ti ^= 0xf000f000f000f000 & (ti << 4);
+            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
+            ti ^= 0xc0c0c0c0 & (ti << 2);
             tmp[i] = ti;
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
@@ -127,13 +122,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let ui = tmp[i];
-            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
-                | (0xff00ff00ff00ff00 & (ui << 8));
-            ti ^= 0x000f000f000f000f & (ui >> 12);
-            tmp[i] = ti
-                ^ (0xfff0fff0fff0fff0 & (ti << 4))
-                ^ (0xff00ff00ff00ff00 & (ti << 8))
-                ^ (0xf000f000f000f000 & (ti << 12));
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
+            ti ^= 0x03030303 & (ui >> 6);
+            tmp[i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
         rk_off += 8;
@@ -145,13 +137,11 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         rcon += 1;
 
         for i in 0..8 {
-            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
-                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
-            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
-            rkeys[rk_off + i] = ti
-                ^ (0xfff0fff0fff0fff0 & (ti << 4))
-                ^ (0xff00ff00ff00ff00 & (ti << 8))
-                ^ (0xf000f000f000f000 & (ti << 12));
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
+                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
+            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
+            rkeys[rk_off + i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
         }
         rk_off += 8;
 
@@ -162,8 +152,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         for i in 0..8 {
             let ui = rkeys[(rk_off - 8) + i];
             let mut ti = rkeys[(rk_off - 16) + i];
-            ti ^= 0x0f000f000f000f00 & (ui >> 4);
-            ti ^= 0xf000f000f000f000 & (ti << 4);
+            ti ^= 0x30303030 & (ui >> 2);
+            ti ^= 0xc0c0c0c0 & (ti << 2);
             tmp[i] = ti;
         }
     }
@@ -194,22 +184,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
 /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
-    let mut rkeys = [0u64; 120];
-
-    bitslice(
-        &mut rkeys[..8],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-    );
-    bitslice(
-        &mut rkeys[8..16],
-        &key[16..],
-        &key[16..],
-        &key[16..],
-        &key[16..],
-    );
+    let mut rkeys = [0u32; 120];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);
 
     let mut rk_off = 8;
 
@@ -270,7 +248,7 @@ pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[80..]);
     inv_sub_bytes(&mut state);
@@ -327,7 +305,7 @@ pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -384,7 +362,7 @@ pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[96..]);
     inv_sub_bytes(&mut state);
@@ -435,7 +413,7 @@ pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -486,7 +464,7 @@ pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[112..]);
     inv_sub_bytes(&mut state);
@@ -543,7 +521,7 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -593,9 +571,9 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     inv_bitslice(&mut state, blocks);
 }
 
-/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true
 /// inverse of 'sub_bytes'.
-fn inv_sub_bytes(state: &mut [u64]) {
+fn inv_sub_bytes(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -803,8 +781,8 @@ fn inv_sub_bytes(state: &mut [u64]) {
 ///
 /// See:
 ///
-/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
-fn sub_bytes(state: &mut [u64]) {
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule.
+fn sub_bytes(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -977,12 +955,12 @@ fn sub_bytes(state: &mut [u64]) {
 
 /// NOT operations that are omitted in S-box
 #[inline]
-fn sub_bytes_nots(state: &mut [u64]) {
+fn sub_bytes_nots(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
-    state[0] ^= 0xffffffffffffffff;
-    state[1] ^= 0xffffffffffffffff;
-    state[5] ^= 0xffffffffffffffff;
-    state[6] ^= 0xffffffffffffffff;
+    state[0] ^= 0xffffffff;
+    state[1] ^= 0xffffffff;
+    state[5] ^= 0xffffffff;
+    state[6] ^= 0xffffffff;
 }
 
 /// Computation of the MixColumns transformation in the fixsliced representation, with different
@@ -1119,13 +1097,13 @@ define_mix_columns!(
 );
 
 #[inline]
-fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
+fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
     let t = (*a ^ ((*a) >> shift)) & mask;
     *a ^= t ^ (t << shift);
 }
 
 #[inline]
-fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
+fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
     let t = (*a ^ ((*b) >> shift)) & mask;
     *a ^= t;
     *b ^= t << shift;
@@ -1134,46 +1112,46 @@ fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
 }
 
 /// Applies ShiftRows once on an AES state (or key).
 #[cfg(not(feature = "compact"))]
 #[inline]
-fn shift_rows_1(state: &mut [u64]) {
+fn shift_rows_1(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x00f000ff000f0000);
-        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+        delta_swap_1(x, 4, 0x0c0f0300);
+        delta_swap_1(x, 2, 0x33003300);
     }
 }
 
 /// Applies ShiftRows twice on an AES state (or key).
 #[inline]
-fn shift_rows_2(state: &mut [u64]) {
+fn shift_rows_2(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x00ff000000ff0000);
+        delta_swap_1(x, 4, 0x0f000f00);
     }
 }
 
 /// Applies ShiftRows three times on an AES state (or key).
 #[inline]
-fn shift_rows_3(state: &mut [u64]) {
+fn shift_rows_3(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x000f00ff00f00000);
-        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+        delta_swap_1(x, 4, 0x030f0c00);
+        delta_swap_1(x, 2, 0x33003300);
     }
 }
 
 #[inline(always)]
-fn inv_shift_rows_1(state: &mut [u64]) {
+fn inv_shift_rows_1(state: &mut [u32]) {
     shift_rows_3(state);
 }
 
 #[inline(always)]
-fn inv_shift_rows_2(state: &mut [u64]) {
+fn inv_shift_rows_2(state: &mut [u32]) {
     shift_rows_2(state);
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
-fn inv_shift_rows_3(state: &mut [u64]) {
+fn inv_shift_rows_3(state: &mut [u32]) {
     shift_rows_1(state);
 }
 
@@ -1185,84 +1163,66 @@ fn inv_shift_rows_3(state: &mut [u64]) {
 ///
 /// The `idx_ror` parameter refers to the rotation value, which varies between the
 /// different key schedules.
-fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
+fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
     for i in 0..8 {
         let off_i = offset + i;
-        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
-        rkeys[off_i] = rk
-            ^ (0xfff0fff0fff0fff0 & (rk << 4))
-            ^ (0xff00ff00ff00ff00 & (rk << 8))
-            ^ (0xf000f000f000f000 & (rk << 12));
+        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
+        rkeys[off_i] =
+            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
     }
 }
 
-/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
-fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
+/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state.
+fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
     debug_assert_eq!(output.len(), 8);
     debug_assert_eq!(input0.len(), 16);
     debug_assert_eq!(input1.len(), 16);
-    debug_assert_eq!(input2.len(), 16);
-    debug_assert_eq!(input3.len(), 16);
 
-    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
-    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an
+    // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
     // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The desired bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
-
-    #[rustfmt::skip]
-    fn read_reordered(input: &[u8]) -> u64 {
-        (u64::from(input[0x0])        ) |
-        (u64::from(input[0x1]) << 0x10) |
-        (u64::from(input[0x2]) << 0x20) |
-        (u64::from(input[0x3]) << 0x30) |
-        (u64::from(input[0x8]) << 0x08) |
-        (u64::from(input[0x9]) << 0x18) |
-        (u64::from(input[0xa]) << 0x28) |
-        (u64::from(input[0xb]) << 0x38)
-    }
-
-    // Reorder each block's bytes on input
-    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
-    // Reorder by relabeling (note the order of input)
-    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
-    let mut t0 = read_reordered(&input0[0x00..0x0c]);
-    let mut t4 = read_reordered(&input0[0x04..0x10]);
-    let mut t1 = read_reordered(&input1[0x00..0x0c]);
-    let mut t5 = read_reordered(&input1[0x04..0x10]);
-    let mut t2 = read_reordered(&input2[0x00..0x0c]);
-    let mut t6 = read_reordered(&input2[0x04..0x10]);
-    let mut t3 = read_reordered(&input3[0x00..0x0c]);
-    let mut t7 = read_reordered(&input3[0x04..0x10]);
-
-    // Bit Index Swap 6 <-> 0:
-    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
-    let m0 = 0x5555555555555555;
+    //     p2 p1 p0 r1 r0 c1 c0 b0
+
+    // Interleave the columns on input (note the order of input)
+    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
+    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
+    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
+    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
+    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
+    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
+    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
+    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
+    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());
+
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
+    let m0 = 0x55555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
 
-    // Bit Index Swap 7 <-> 1:
-    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
-    let m1 = 0x3333333333333333;
+    // Bit Index Swap 6 <-> 1:
+    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
+    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
 
-    // Bit Index Swap 8 <-> 2:
-    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
-    let m2 = 0x0f0f0f0f0f0f0f0f;
+    // Bit Index Swap 7 <-> 2:
+    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
+    let m2 = 0x0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
     // Final bitsliced bit index, as desired:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b0
     output[0] = t0;
     output[1] = t1;
     output[2] = t2;
@@ -1273,18 +1233,18 @@ fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], inp
     output[7] = t7;
 }
 
-/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
-fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
+/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
+fn inv_bitslice(input: &mut [u32], output: &mut [Block]) {
     debug_assert_eq!(input.len(), 8);
-    debug_assert_eq!(output.len(), 4);
+    debug_assert_eq!(output.len(), 2);
 
-    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
-    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
+    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
     // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The initially bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b0
 
     let mut t0 = input[0];
     let mut t1 = input[1];
@@ -1297,61 +1257,47 @@ fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
 
     // TODO: these bit index swaps are identical to those in 'packing'
 
-    // Bit Index Swap 6 <-> 0:
-    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
-    let m0 = 0x5555555555555555;
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
+    let m0 = 0x55555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
 
-    // Bit Index Swap 7 <-> 1:
-    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
-    let m1 = 0x3333333333333333;
+    // Bit Index Swap 6 <-> 1:
+    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
+    let m1 = 0x33333333;
     delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
 
-    // Bit Index Swap 8 <-> 2:
-    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
-    let m2 = 0x0f0f0f0f0f0f0f0f;
+    // Bit Index Swap 7 <-> 2:
+    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
+    let m2 = 0x0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
-    #[rustfmt::skip]
-    fn write_reordered(columns: u64, output: &mut [u8]) {
-        output[0x0] = (columns        ) as u8;
-        output[0x1] = (columns >> 0x10) as u8;
-        output[0x2] = (columns >> 0x20) as u8;
-        output[0x3] = (columns >> 0x30) as u8;
-        output[0x8] = (columns >> 0x08) as u8;
-        output[0x9] = (columns >> 0x18) as u8;
-        output[0xa] = (columns >> 0x28) as u8;
-        output[0xb] = (columns >> 0x38) as u8;
-    }
-
-    // Reorder by relabeling (note the order of output)
-    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
-    // Reorder each block's bytes on output
-    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
-    write_reordered(t0, &mut output[0][0x00..0x0c]);
-    write_reordered(t4, &mut output[0][0x04..0x10]);
-    write_reordered(t1, &mut output[1][0x00..0x0c]);
-    write_reordered(t5, &mut output[1][0x04..0x10]);
-    write_reordered(t2, &mut output[2][0x00..0x0c]);
-    write_reordered(t6, &mut output[2][0x04..0x10]);
-    write_reordered(t3, &mut output[3][0x00..0x0c]);
-    write_reordered(t7, &mut output[3][0x04..0x10]);
+    // De-interleave the columns on output (note the order of output)
+    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
+    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
+    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
+    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
+    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
+    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
+    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
+    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
+    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());
 
     // Final AES bit index, as desired:
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
 }
 
 /// Copy 32-bytes within the provided slice to an 8-byte offset
-fn memshift32(buffer: &mut [u64], src_offset: usize) {
+fn memshift32(buffer: &mut [u32], src_offset: usize) {
     debug_assert_eq!(src_offset % 8, 0);
 
     let dst_offset = src_offset + 8;
@@ -1365,7 +1311,7 @@ fn memshift32(buffer: &mut [u64], src_offset: usize) {
 /// XOR the round key to the internal state. The round keys are expected to be
 /// pre-computed and to be packed in the fixsliced representation.
 #[inline]
-fn add_round_key(state: &mut State, rkey: &[u64]) {
+fn add_round_key(state: &mut State, rkey: &[u32]) {
     debug_assert_eq!(rkey.len(), 8);
     for (a, b) in state.iter_mut().zip(rkey) {
         *a ^= b;
@@ -1373,56 +1319,56 @@ fn add_round_key(state: &mut State, rkey: &[u64]) {
 }
 
 #[inline(always)]
-fn add_round_constant_bit(state: &mut [u64], bit: usize) {
-    state[bit] ^= 0x00000000f0000000;
+fn add_round_constant_bit(state: &mut [u32], bit: usize) {
+    state[bit] ^= 0x0000c000;
 }
 
 #[inline(always)]
-fn ror(x: u64, y: u32) -> u64 {
+fn ror(x: u32, y: u32) -> u32 {
     x.rotate_right(y)
 }
 
 #[inline(always)]
 fn ror_distance(rows: u32, cols: u32) -> u32 {
-    (rows << 4) + (cols << 2)
+    (rows << 3) + (cols << 1)
 }
 
 #[inline(always)]
-fn rotate_rows_1(x: u64) -> u64 {
+fn rotate_rows_1(x: u32) -> u32 {
     ror(x, ror_distance(1, 0))
 }
 
 #[inline(always)]
-fn rotate_rows_2(x: u64) -> u64 {
+fn rotate_rows_2(x: u32) -> u32 {
     ror(x, ror_distance(2, 0))
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
-    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
+fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
+    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
-    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
+fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
-    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
+fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 3)) & 0x03030303) |
+    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
-    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
-    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
+fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
+    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
 }
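
Illustrative sketch (not part of the patch): the snippet below demonstrates the delta-swap primitive that the new 32-bit path relies on in `delta_swap_1`/`delta_swap_2` and in the ShiftRows and bit-index-swap masks above. `delta_swap_1` is copied from the diff (u32 form); the surrounding `main`, its sample value, and the assertions are illustrative assumptions only.

// Standalone demonstration of the delta swap used by the bitsliced backend.
fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
    // Bits selected by `mask` are exchanged with the bits `shift` positions above them.
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

fn main() {
    let original: u32 = 0x1234_5678;
    let mut x = original;

    // Same shift/mask pair the new `bitslice` uses for its first bit-index swap:
    // swap every even-indexed bit with the odd-indexed bit directly above it.
    delta_swap_1(&mut x, 1, 0x5555_5555);
    assert_ne!(x, original);

    // The delta swap is an involution: applying it a second time restores the input.
    delta_swap_1(&mut x, 1, 0x5555_5555);
    assert_eq!(x, original);
}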
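
A second sketch, also outside the patch, works through why `ror_distance` changes from `(rows << 4) + (cols << 2)` to `(rows << 3) + (cols << 1)`: with two blocks per 32-bit word the in-word bit index is `r1 r0 c1 c0 b0` (see the `bitslice` comments above), so one row step is 8 bit positions and one column step is 2, instead of 16 and 4 in the old four-block/64-bit layout.

// Rotation distances for the 32-bit fixsliced bit ordering `p2 p1 p0 r1 r0 c1 c0 b0`,
// matching the `ror_distance` introduced by the patch; `main` is illustrative only.
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 3) + (cols << 1)
}

fn main() {
    // A row step skips the two column bits plus the block bit below it: 2^3 = 8 positions.
    assert_eq!(ror_distance(1, 0), 8);
    // A column step skips only the block bit: 2^1 = 2 positions.
    assert_eq!(ror_distance(0, 1), 2);
    // Four row steps wrap the whole 32-bit word.
    assert_eq!(ror_distance(4, 0), 32);
}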