diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml
index e7e3fed0..18c4afe6 100644
--- a/.github/workflows/aes.yml
+++ b/.github/workflows/aes.yml
@@ -39,7 +39,6 @@ jobs:
       - run: cargo build --release --target ${{ matrix.target }}
       - run: cargo build --release --target ${{ matrix.target }} --features compact
       - run: cargo build --release --target ${{ matrix.target }} --features ctr
-      - run: cargo build --release --target ${{ matrix.target }} --features force-soft
       - run: cargo build --release --target ${{ matrix.target }} --all-features
 
   # Tests for the portable software backend
@@ -74,7 +73,6 @@ jobs:
       - run: cargo test --release --target ${{ matrix.target }}
       - run: cargo test --release --target ${{ matrix.target }} --features compact
       - run: cargo test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
 
   # Tests for the AES-NI backend
@@ -113,7 +111,6 @@ jobs:
       - run: cargo test --release --target ${{ matrix.target }}
       - run: cargo test --release --target ${{ matrix.target }} --features compact
       - run: cargo test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
 
   # Cross-compiled tests
@@ -147,5 +144,4 @@ jobs:
       - run: cross test --release --target ${{ matrix.target }}
       - run: cross test --release --target ${{ matrix.target }} --features compact
       - run: cross test --release --target ${{ matrix.target }} --features ctr
-      - run: cargo test --release --target ${{ matrix.target }} --features force-soft
       - run: cargo test --release --target ${{ matrix.target }} --all-features
diff --git a/aes/Cargo.toml b/aes/Cargo.toml
index a85585dd..036c44c5 100644
--- a/aes/Cargo.toml
+++ b/aes/Cargo.toml
@@ -27,8 +27,7 @@ cipher = { version = "=0.3.0-pre", features = ["dev"] }
 cpuid-bool = "0.2"
 
 [features]
-compact = [] # Reduce code size at the cost of performance
-force-soft = [] # Disable support for AES hardware intrinsics
+compact = [] # Reduce code size at the cost of slower performance
 
 [package.metadata.docs.rs]
 features = ["ctr"]
diff --git a/aes/src/autodetect.rs b/aes/src/autodetect.rs
index 4ef9789f..8ed55c9f 100644
--- a/aes/src/autodetect.rs
+++ b/aes/src/autodetect.rs
@@ -130,6 +130,7 @@ pub(crate) mod ctr {
     }
 
     mod $module {
+        #[allow(clippy::large_enum_variant)]
        pub(super) enum Inner {
            Ni(crate::ni::$name),
            Soft(crate::soft::$name),
diff --git a/aes/src/lib.rs b/aes/src/lib.rs
index 68a951d6..72c19a9f 100644
--- a/aes/src/lib.rs
+++ b/aes/src/lib.rs
@@ -62,10 +62,7 @@ use cfg_if::cfg_if;
 
 cfg_if! {
-    if #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        not(feature = "force-soft")
-    ))] {
+    if #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] {
         mod autodetect;
         mod ni;
         mod soft;
diff --git a/aes/src/ni/aes128.rs b/aes/src/ni/aes128.rs
index 6f37cd53..f6f3fcfe 100644
--- a/aes/src/ni/aes128.rs
+++ b/aes/src/ni/aes128.rs
@@ -112,6 +112,7 @@ impl BlockDecrypt for Aes128 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[10]);
             b = _mm_aesdec_si128(b, keys[9]);
             b = _mm_aesdec_si128(b, keys[8]);
@@ -123,6 +124,9 @@ impl BlockDecrypt for Aes128 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/ni/aes192.rs b/aes/src/ni/aes192.rs
index 9cd4c345..4e3ac724 100644
--- a/aes/src/ni/aes192.rs
+++ b/aes/src/ni/aes192.rs
@@ -114,6 +114,7 @@ impl BlockDecrypt for Aes192 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[12]);
             b = _mm_aesdec_si128(b, keys[11]);
             b = _mm_aesdec_si128(b, keys[10]);
@@ -127,6 +128,9 @@ impl BlockDecrypt for Aes192 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/ni/aes256.rs b/aes/src/ni/aes256.rs
index 17d47a46..4e1a70fc 100644
--- a/aes/src/ni/aes256.rs
+++ b/aes/src/ni/aes256.rs
@@ -118,6 +118,7 @@ impl BlockDecrypt for Aes256 {
             // Safety: `loadu` and `storeu` support unaligned access
             #[allow(clippy::cast_ptr_alignment)]
             let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
             b = _mm_xor_si128(b, keys[14]);
             b = _mm_aesdec_si128(b, keys[13]);
             b = _mm_aesdec_si128(b, keys[12]);
@@ -133,6 +134,9 @@ impl BlockDecrypt for Aes256 {
             b = _mm_aesdec_si128(b, keys[2]);
             b = _mm_aesdec_si128(b, keys[1]);
             b = _mm_aesdeclast_si128(b, keys[0]);
+
+            // Safety: `loadu` and `storeu` support unaligned access
+            #[allow(clippy::cast_ptr_alignment)]
             _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
         }
 
diff --git a/aes/src/soft/fixslice64.rs b/aes/src/soft/fixslice64.rs
index f4e57fc7..802081e4 100644
--- a/aes/src/soft/fixslice64.rs
+++ b/aes/src/soft/fixslice64.rs
@@ -20,27 +20,28 @@ use cipher::{
     consts::{U16, U24, U32},
     generic_array::GenericArray,
 };
+use core::convert::TryInto;
 
 /// AES block batch size for this implementation
-pub(crate) const FIXSLICE_BLOCKS: usize = 4;
+pub(crate) const FIXSLICE_BLOCKS: usize = 2;
 
 /// AES-128 round keys
-pub(crate) type FixsliceKeys128 = [u64; 88];
+pub(crate) type FixsliceKeys128 = [u32; 88];
 
 /// AES-192 round keys
-pub(crate) type FixsliceKeys192 = [u64; 104];
+pub(crate) type FixsliceKeys192 = [u32; 104];
 
 /// AES-256 round keys
-pub(crate) type FixsliceKeys256 = [u64; 120];
+pub(crate) type FixsliceKeys256 = [u32; 120];
 
-/// 512-bit internal state
-type State = [u64; 8];
+/// 256-bit internal state
+type State = [u32; 8];
 
 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
-    let mut rkeys = [0u64; 88];
+    let mut rkeys = [0u32; 88];
 
-    bitslice(&mut rkeys[..8], key, key, key, key);
+    bitslice(&mut rkeys[..8], key, key);
 
     let mut rk_off = 0;
     for rcon in 0..10 {
@@ -89,25 +90,19 @@ pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys12
 
 /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
-    let mut rkeys = [0u64; 104];
-    let mut tmp = [0u64; 8];
-
-    bitslice(
-        &mut rkeys[..8],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-    );
-    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);
+    let mut rkeys = [0u32; 104];
+    let mut tmp = [0u32; 8];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut tmp, &key[8..], &key[8..]);
 
     let mut rcon = 0;
     let mut rk_off = 8;
 
     loop {
         for i in 0..8 {
-            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
-                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
+            rkeys[rk_off + i] =
+                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
         }
 
         sub_bytes(&mut tmp);
@@ -118,8 +113,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let mut ti = rkeys[rk_off + i];
-            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
-            ti ^= 0xf000f000f000f000 & (ti << 4);
+            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
+            ti ^= 0xc0c0c0c0 & (ti << 2);
             tmp[i] = ti;
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
@@ -127,13 +122,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let ui = tmp[i];
-            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
-                | (0xff00ff00ff00ff00 & (ui << 8));
-            ti ^= 0x000f000f000f000f & (ui >> 12);
-            tmp[i] = ti
-                ^ (0xfff0fff0fff0fff0 & (ti << 4))
-                ^ (0xff00ff00ff00ff00 & (ti << 8))
-                ^ (0xf000f000f000f000 & (ti << 12));
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
+            ti ^= 0x03030303 & (ui >> 6);
+            tmp[i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
         rk_off += 8;
@@ -145,13 +137,11 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         rcon += 1;
 
         for i in 0..8 {
-            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
-                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
-            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
-            rkeys[rk_off + i] = ti
-                ^ (0xfff0fff0fff0fff0 & (ti << 4))
-                ^ (0xff00ff00ff00ff00 & (ti << 8))
-                ^ (0xf000f000f000f000 & (ti << 12));
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
+                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
+            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
+            rkeys[rk_off + i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
         }
         rk_off += 8;
 
@@ -162,8 +152,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         for i in 0..8 {
             let ui = rkeys[(rk_off - 8) + i];
             let mut ti = rkeys[(rk_off - 16) + i];
-            ti ^= 0x0f000f000f000f00 & (ui >> 4);
-            ti ^= 0xf000f000f000f000 & (ti << 4);
+            ti ^= 0x30303030 & (ui >> 2);
+            ti ^= 0xc0c0c0c0 & (ti << 2);
             tmp[i] = ti;
         }
     }
@@ -194,22 +184,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
 /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
-    let mut rkeys = [0u64; 120];
-
-    bitslice(
-        &mut rkeys[..8],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-        &key[..16],
-    );
-    bitslice(
-        &mut rkeys[8..16],
-        &key[16..],
-        &key[16..],
-        &key[16..],
-        &key[16..],
-    );
+    let mut rkeys = [0u32; 120];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);
 
     let mut rk_off = 8;
 
@@ -270,7 +248,7 @@ pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[80..]);
     inv_sub_bytes(&mut state);
@@ -327,7 +305,7 @@ pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -384,7 +362,7 @@ pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[96..]);
     inv_sub_bytes(&mut state);
@@ -435,7 +413,7 @@ pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -486,7 +464,7 @@ pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[112..]);
     inv_sub_bytes(&mut state);
@@ -543,7 +521,7 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+    bitslice(&mut state, &blocks[0], &blocks[1]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -593,9 +571,9 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     inv_bitslice(&mut state, blocks);
 }
 
-/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true
 /// inverse of 'sub_bytes'.
-fn inv_sub_bytes(state: &mut [u64]) {
+fn inv_sub_bytes(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -803,8 +781,8 @@ fn inv_sub_bytes(state: &mut [u64]) {
 ///
 /// See:
 ///
-/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
-fn sub_bytes(state: &mut [u64]) {
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule.
+fn sub_bytes(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -977,12 +955,12 @@ fn sub_bytes(state: &mut [u64]) {
 
 /// NOT operations that are omitted in S-box
 #[inline]
-fn sub_bytes_nots(state: &mut [u64]) {
+fn sub_bytes_nots(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
-    state[0] ^= 0xffffffffffffffff;
-    state[1] ^= 0xffffffffffffffff;
-    state[5] ^= 0xffffffffffffffff;
-    state[6] ^= 0xffffffffffffffff;
+    state[0] ^= 0xffffffff;
+    state[1] ^= 0xffffffff;
+    state[5] ^= 0xffffffff;
+    state[6] ^= 0xffffffff;
 }
 
 /// Computation of the MixColumns transformation in the fixsliced representation, with different
@@ -1119,13 +1097,13 @@ define_mix_columns!(
 );
 
 #[inline]
-fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
+fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
     let t = (*a ^ ((*a) >> shift)) & mask;
     *a ^= t ^ (t << shift);
 }
 
 #[inline]
-fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
+fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
     let t = (*a ^ ((*b) >> shift)) & mask;
     *a ^= t;
     *b ^= t << shift;
@@ -1134,46 +1112,46 @@ fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
 }
 
 /// Applies ShiftRows once on an AES state (or key).
 #[cfg(not(feature = "compact"))]
 #[inline]
-fn shift_rows_1(state: &mut [u64]) {
+fn shift_rows_1(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x00f000ff000f0000);
-        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+        delta_swap_1(x, 4, 0x0c0f0300);
+        delta_swap_1(x, 2, 0x33003300);
     }
 }
 
 /// Applies ShiftRows twice on an AES state (or key).
 #[inline]
-fn shift_rows_2(state: &mut [u64]) {
+fn shift_rows_2(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x00ff000000ff0000);
+        delta_swap_1(x, 4, 0x0f000f00);
     }
 }
 
 /// Applies ShiftRows three times on an AES state (or key).
 #[inline]
-fn shift_rows_3(state: &mut [u64]) {
+fn shift_rows_3(state: &mut [u32]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 8, 0x000f00ff00f00000);
-        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+        delta_swap_1(x, 4, 0x030f0c00);
+        delta_swap_1(x, 2, 0x33003300);
     }
 }
 
 #[inline(always)]
-fn inv_shift_rows_1(state: &mut [u64]) {
+fn inv_shift_rows_1(state: &mut [u32]) {
     shift_rows_3(state);
 }
 
 #[inline(always)]
-fn inv_shift_rows_2(state: &mut [u64]) {
+fn inv_shift_rows_2(state: &mut [u32]) {
     shift_rows_2(state);
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
-fn inv_shift_rows_3(state: &mut [u64]) {
+fn inv_shift_rows_3(state: &mut [u32]) {
     shift_rows_1(state);
 }
 
@@ -1185,84 +1163,66 @@ fn inv_shift_rows_3(state: &mut [u64]) {
 ///
 /// The `idx_ror` parameter refers to the rotation value, which varies between the
 /// different key schedules.
-fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
+fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
     for i in 0..8 {
         let off_i = offset + i;
-        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
-        rkeys[off_i] = rk
-            ^ (0xfff0fff0fff0fff0 & (rk << 4))
-            ^ (0xff00ff00ff00ff00 & (rk << 8))
-            ^ (0xf000f000f000f000 & (rk << 12));
+        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
+        rkeys[off_i] =
+            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
     }
 }
 
-/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
-fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
+/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state.
+fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
     debug_assert_eq!(output.len(), 8);
     debug_assert_eq!(input0.len(), 16);
     debug_assert_eq!(input1.len(), 16);
-    debug_assert_eq!(input2.len(), 16);
-    debug_assert_eq!(input3.len(), 16);
 
-    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
-    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an
+    // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
     // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The desired bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
-
-    #[rustfmt::skip]
-    fn read_reordered(input: &[u8]) -> u64 {
-        (u64::from(input[0x0])        ) |
-        (u64::from(input[0x1]) << 0x10) |
-        (u64::from(input[0x2]) << 0x20) |
-        (u64::from(input[0x3]) << 0x30) |
-        (u64::from(input[0x8]) << 0x08) |
-        (u64::from(input[0x9]) << 0x18) |
-        (u64::from(input[0xa]) << 0x28) |
-        (u64::from(input[0xb]) << 0x38)
-    }
-
-    // Reorder each block's bytes on input
-    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
-    // Reorder by relabeling (note the order of input)
-    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
-    let mut t0 = read_reordered(&input0[0x00..0x0c]);
-    let mut t4 = read_reordered(&input0[0x04..0x10]);
-    let mut t1 = read_reordered(&input1[0x00..0x0c]);
-    let mut t5 = read_reordered(&input1[0x04..0x10]);
-    let mut t2 = read_reordered(&input2[0x00..0x0c]);
-    let mut t6 = read_reordered(&input2[0x04..0x10]);
-    let mut t3 = read_reordered(&input3[0x00..0x0c]);
-    let mut t7 = read_reordered(&input3[0x04..0x10]);
-
-    // Bit Index Swap 6 <-> 0:
-    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
-    let m0 = 0x5555555555555555;
+    //     p2 p1 p0 r1 r0 c1 c0 b0
+
+    // Interleave the columns on input (note the order of input)
+    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
+    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
+    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
+    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
+    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
+    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
+    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
+    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
+    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());
+
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
+    let m0 = 0x55555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
 
-    // Bit Index Swap 7 <-> 1:
-    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
-    let m1 = 0x3333333333333333;
+    // Bit Index Swap 6 <-> 1:
+    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
+    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
 
-    // Bit Index Swap 8 <-> 2:
-    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
-    let m2 = 0x0f0f0f0f0f0f0f0f;
+    // Bit Index Swap 7 <-> 2:
+    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
+    let m2 = 0x0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
     // Final bitsliced bit index, as desired:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b0
     output[0] = t0;
     output[1] = t1;
     output[2] = t2;
@@ -1273,18 +1233,18 @@ fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], inp
     output[7] = t7;
 }
 
-/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
-fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
+/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
+fn inv_bitslice(input: &mut [u32], output: &mut [Block]) {
     debug_assert_eq!(input.len(), 8);
-    debug_assert_eq!(output.len(), 4);
+    debug_assert_eq!(output.len(), 2);
 
-    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
-    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
+    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
     // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The initially bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b0
 
     let mut t0 = input[0];
     let mut t1 = input[1];
@@ -1297,61 +1257,47 @@ fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
 
     // TODO: these bit index swaps are identical to those in 'packing'
 
-    // Bit Index Swap 6 <-> 0:
-    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
-    let m0 = 0x5555555555555555;
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
+    let m0 = 0x55555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
 
-    // Bit Index Swap 7 <-> 1:
-    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
-    let m1 = 0x3333333333333333;
+    // Bit Index Swap 6 <-> 1:
+    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
+    let m1 = 0x33333333;
     delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
 
-    // Bit Index Swap 8 <-> 2:
-    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
-    let m2 = 0x0f0f0f0f0f0f0f0f;
+    // Bit Index Swap 7 <-> 2:
+    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
+    let m2 = 0x0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
-    #[rustfmt::skip]
-    fn write_reordered(columns: u64, output: &mut [u8]) {
-        output[0x0] = (columns        ) as u8;
-        output[0x1] = (columns >> 0x10) as u8;
-        output[0x2] = (columns >> 0x20) as u8;
-        output[0x3] = (columns >> 0x30) as u8;
-        output[0x8] = (columns >> 0x08) as u8;
-        output[0x9] = (columns >> 0x18) as u8;
-        output[0xa] = (columns >> 0x28) as u8;
-        output[0xb] = (columns >> 0x38) as u8;
-    }
-
-    // Reorder by relabeling (note the order of output)
-    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
-    // Reorder each block's bytes on output
-    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
-    write_reordered(t0, &mut output[0][0x00..0x0c]);
-    write_reordered(t4, &mut output[0][0x04..0x10]);
-    write_reordered(t1, &mut output[1][0x00..0x0c]);
-    write_reordered(t5, &mut output[1][0x04..0x10]);
-    write_reordered(t2, &mut output[2][0x00..0x0c]);
-    write_reordered(t6, &mut output[2][0x04..0x10]);
-    write_reordered(t3, &mut output[3][0x00..0x0c]);
-    write_reordered(t7, &mut output[3][0x04..0x10]);
+    // De-interleave the columns on output (note the order of output)
+    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
+    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
+    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
+    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
+    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
+    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
+    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
+    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
+    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());
 
     // Final AES bit index, as desired:
-    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //     b0 c1 c0 r1 r0 p2 p1 p0
 }
 
 /// Copy 32-bytes within the provided slice to an 8-byte offset
-fn memshift32(buffer: &mut [u64], src_offset: usize) {
+fn memshift32(buffer: &mut [u32], src_offset: usize) {
     debug_assert_eq!(src_offset % 8, 0);
 
     let dst_offset = src_offset + 8;
@@ -1365,7 +1311,7 @@ fn memshift32(buffer: &mut [u64], src_offset: usize) {
 /// XOR the round key to the internal state. The round keys are expected to be
 /// pre-computed and to be packed in the fixsliced representation.
 #[inline]
-fn add_round_key(state: &mut State, rkey: &[u64]) {
+fn add_round_key(state: &mut State, rkey: &[u32]) {
     debug_assert_eq!(rkey.len(), 8);
     for (a, b) in state.iter_mut().zip(rkey) {
         *a ^= b;
@@ -1373,56 +1319,56 @@ fn add_round_key(state: &mut State, rkey: &[u64]) {
 }
 
 #[inline(always)]
-fn add_round_constant_bit(state: &mut [u64], bit: usize) {
-    state[bit] ^= 0x00000000f0000000;
+fn add_round_constant_bit(state: &mut [u32], bit: usize) {
+    state[bit] ^= 0x0000c000;
 }
 
 #[inline(always)]
-fn ror(x: u64, y: u32) -> u64 {
+fn ror(x: u32, y: u32) -> u32 {
     x.rotate_right(y)
 }
 
 #[inline(always)]
 fn ror_distance(rows: u32, cols: u32) -> u32 {
-    (rows << 4) + (cols << 2)
+    (rows << 3) + (cols << 1)
 }
 
 #[inline(always)]
-fn rotate_rows_1(x: u64) -> u64 {
+fn rotate_rows_1(x: u32) -> u32 {
     ror(x, ror_distance(1, 0))
 }
 
 #[inline(always)]
-fn rotate_rows_2(x: u64) -> u64 {
+fn rotate_rows_2(x: u32) -> u32 {
     ror(x, ror_distance(2, 0))
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
-    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
+fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
+    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
-    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
+fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
-    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
-    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
+fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 3)) & 0x03030303) |
+    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
-    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
-    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
+fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
+    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
 }
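
Illustrative sketch (not part of the patch): the snippet below demonstrates the delta-swap primitive that the new 32-bit path relies on in `delta_swap_1`/`delta_swap_2` and in the ShiftRows and bit-index-swap masks above. `delta_swap_1` is copied from the diff (u32 form); the surrounding `main`, its sample value, and the assertions are illustrative assumptions only.

// Standalone demonstration of the delta swap used by the bitsliced backend.
fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
    // Bits selected by `mask` are exchanged with the bits `shift` positions above them.
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

fn main() {
    let original: u32 = 0x1234_5678;
    let mut x = original;

    // Same shift/mask pair the new `bitslice` uses for its first bit-index swap:
    // swap every even-indexed bit with the odd-indexed bit directly above it.
    delta_swap_1(&mut x, 1, 0x5555_5555);
    assert_ne!(x, original);

    // The delta swap is an involution: applying it a second time restores the input.
    delta_swap_1(&mut x, 1, 0x5555_5555);
    assert_eq!(x, original);
}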
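
A second sketch, also outside the patch, works through why `ror_distance` changes from `(rows << 4) + (cols << 2)` to `(rows << 3) + (cols << 1)`: with two blocks per 32-bit word the in-word bit index is `r1 r0 c1 c0 b0` (see the `bitslice` comments above), so one row step is 8 bit positions and one column step is 2, instead of 16 and 4 in the old four-block/64-bit layout.

// Rotation distances for the 32-bit fixsliced bit ordering `p2 p1 p0 r1 r0 c1 c0 b0`,
// matching the `ror_distance` introduced by the patch; `main` is illustrative only.
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 3) + (cols << 1)
}

fn main() {
    // A row step skips the two column bits plus the block bit below it: 2^3 = 8 positions.
    assert_eq!(ror_distance(1, 0), 8);
    // A column step skips only the block bit: 2^1 = 2 positions.
    assert_eq!(ror_distance(0, 1), 2);
    // Four row steps wrap the whole 32-bit word.
    assert_eq!(ror_distance(4, 0), 32);
}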