diff --git a/aes/src/soft/fixslice32.rs b/aes/src/soft/fixslice32.rs
index 802081e4..77e87a0d 100644
--- a/aes/src/soft/fixslice32.rs
+++ b/aes/src/soft/fixslice32.rs
@@ -1,5 +1,5 @@
-//! Fixsliced implementations of AES-128, AES-192 and AES-256
-//! adapted from the C implementation.
+//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit)
+//! adapted from the C implementation
 //!
 //! All implementations are fully bitsliced and do not rely on any
 //! Look-Up Table (LUT).
diff --git a/aes/src/soft/fixslice64.rs b/aes/src/soft/fixslice64.rs
index 802081e4..9fd0cc85 100644
--- a/aes/src/soft/fixslice64.rs
+++ b/aes/src/soft/fixslice64.rs
@@ -1,4 +1,4 @@
-//! Fixsliced implementations of AES-128, AES-192 and AES-256
+//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit)
 //! adapted from the C implementation.
 //!
 //! All implementations are fully bitsliced and do not rely on any
@@ -20,28 +20,27 @@ use cipher::{
     consts::{U16, U24, U32},
     generic_array::GenericArray,
 };
-use core::convert::TryInto;
 
 /// AES block batch size for this implementation
-pub(crate) const FIXSLICE_BLOCKS: usize = 2;
+pub(crate) const FIXSLICE_BLOCKS: usize = 4;
 
 /// AES-128 round keys
-pub(crate) type FixsliceKeys128 = [u32; 88];
+pub(crate) type FixsliceKeys128 = [u64; 88];
 
 /// AES-192 round keys
-pub(crate) type FixsliceKeys192 = [u32; 104];
+pub(crate) type FixsliceKeys192 = [u64; 104];
 
 /// AES-256 round keys
-pub(crate) type FixsliceKeys256 = [u32; 120];
+pub(crate) type FixsliceKeys256 = [u64; 120];
 
-/// 256-bit internal state
-type State = [u32; 8];
+/// 512-bit internal state
+type State = [u64; 8];
 
 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
-    let mut rkeys = [0u32; 88];
+    let mut rkeys = [0u64; 88];
 
-    bitslice(&mut rkeys[..8], key, key);
+    bitslice(&mut rkeys[..8], key, key, key, key);
 
     let mut rk_off = 0;
     for rcon in 0..10 {
@@ -90,19 +89,25 @@ pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys12
 /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
-    let mut rkeys = [0u32; 104];
-    let mut tmp = [0u32; 8];
-
-    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
-    bitslice(&mut tmp, &key[8..], &key[8..]);
+    let mut rkeys = [0u64; 104];
+    let mut tmp = [0u64; 8];
+
+    bitslice(
+        &mut rkeys[..8],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+    );
+    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);
 
     let mut rcon = 0;
     let mut rk_off = 8;
 
     loop {
         for i in 0..8 {
-            rkeys[rk_off + i] =
-                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
+            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
+                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
         }
         sub_bytes(&mut tmp);
@@ -113,8 +118,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let mut ti = rkeys[rk_off + i];
-            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
-            ti ^= 0xc0c0c0c0 & (ti << 2);
+            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
+            ti ^= 0xf000f000f000f000 & (ti << 4);
             tmp[i] = ti;
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
@@ -122,10 +127,13 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 
         for i in 0..8 {
             let ui = tmp[i];
-            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
-            ti ^= 0x03030303 & (ui >> 6);
-            tmp[i] =
-                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
+            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
+                | (0xff00ff00ff00ff00 & (ui << 8));
+            ti ^= 0x000f000f000f000f & (ui >> 12);
+            tmp[i] = ti
+                ^ (0xfff0fff0fff0fff0 & (ti << 4))
+                ^ (0xff00ff00ff00ff00 & (ti << 8))
+                ^ (0xf000f000f000f000 & (ti << 12));
         }
         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
         rk_off += 8;
@@ -137,11 +145,13 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         rcon += 1;
 
         for i in 0..8 {
-            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
-                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
-            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
-            rkeys[rk_off + i] =
-                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
+            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
+                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
+            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
+            rkeys[rk_off + i] = ti
+                ^ (0xfff0fff0fff0fff0 & (ti << 4))
+                ^ (0xff00ff00ff00ff00 & (ti << 8))
+                ^ (0xf000f000f000f000 & (ti << 12));
         }
         rk_off += 8;
@@ -152,8 +162,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
         for i in 0..8 {
             let ui = rkeys[(rk_off - 8) + i];
             let mut ti = rkeys[(rk_off - 16) + i];
-            ti ^= 0x30303030 & (ui >> 2);
-            ti ^= 0xc0c0c0c0 & (ti << 2);
+            ti ^= 0x0f000f000f000f00 & (ui >> 4);
+            ti ^= 0xf000f000f000f000 & (ti << 4);
             tmp[i] = ti;
         }
     }
@@ -184,10 +194,22 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys19
 /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
 pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
-    let mut rkeys = [0u32; 120];
-
-    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
-    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);
+    let mut rkeys = [0u64; 120];
+
+    bitslice(
+        &mut rkeys[..8],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+    );
+    bitslice(
+        &mut rkeys[8..16],
+        &key[16..],
+        &key[16..],
+        &key[16..],
+        &key[16..],
+    );
 
     let mut rk_off = 8;
@@ -248,7 +270,7 @@ pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[80..]);
     inv_sub_bytes(&mut state);
@@ -305,7 +327,7 @@ pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -362,7 +384,7 @@ pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[96..]);
     inv_sub_bytes(&mut state);
@@ -413,7 +435,7 @@ pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -464,7 +486,7 @@ pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[112..]);
     inv_sub_bytes(&mut state);
@@ -521,7 +543,7 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
 
     let mut state = State::default();
-    bitslice(&mut state, &blocks[0], &blocks[1]);
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
 
     add_round_key(&mut state, &rkeys[..8]);
 
@@ -571,9 +593,9 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
     inv_bitslice(&mut state, blocks);
 }
 
-/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true
+/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
 /// inverse of 'sub_bytes'.
-fn inv_sub_bytes(state: &mut [u32]) {
+fn inv_sub_bytes(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -781,8 +803,8 @@ fn inv_sub_bytes(state: &mut [u32]) {
 ///
 /// See:
 ///
-/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule.
-fn sub_bytes(state: &mut [u32]) {
+/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
+fn sub_bytes(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
 
     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
@@ -955,12 +977,12 @@ fn sub_bytes(state: &mut [u32]) {
 /// NOT operations that are omitted in S-box
 #[inline]
-fn sub_bytes_nots(state: &mut [u32]) {
+fn sub_bytes_nots(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
-    state[0] ^= 0xffffffff;
-    state[1] ^= 0xffffffff;
-    state[5] ^= 0xffffffff;
-    state[6] ^= 0xffffffff;
+    state[0] ^= 0xffffffffffffffff;
+    state[1] ^= 0xffffffffffffffff;
+    state[5] ^= 0xffffffffffffffff;
+    state[6] ^= 0xffffffffffffffff;
 }
 
 /// Computation of the MixColumns transformation in the fixsliced representation, with different
@@ -1097,13 +1119,13 @@ define_mix_columns!(
 );
 
 #[inline]
-fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
+fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
     let t = (*a ^ ((*a) >> shift)) & mask;
     *a ^= t ^ (t << shift);
 }
 
 #[inline]
-fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
+fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
     let t = (*a ^ ((*b) >> shift)) & mask;
     *a ^= t;
     *b ^= t << shift;
@@ -1112,46 +1134,46 @@ fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
 /// Applies ShiftRows once on an AES state (or key).
 #[cfg(not(feature = "compact"))]
 #[inline]
-fn shift_rows_1(state: &mut [u32]) {
+fn shift_rows_1(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 4, 0x0c0f0300);
-        delta_swap_1(x, 2, 0x33003300);
+        delta_swap_1(x, 8, 0x00f000ff000f0000);
+        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
     }
 }
 
 /// Applies ShiftRows twice on an AES state (or key).
 #[inline]
-fn shift_rows_2(state: &mut [u32]) {
+fn shift_rows_2(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 4, 0x0f000f00);
+        delta_swap_1(x, 8, 0x00ff000000ff0000);
    }
 }
 
 /// Applies ShiftRows three times on an AES state (or key).
 #[inline]
-fn shift_rows_3(state: &mut [u32]) {
+fn shift_rows_3(state: &mut [u64]) {
     debug_assert_eq!(state.len(), 8);
     for x in state.iter_mut() {
-        delta_swap_1(x, 4, 0x030f0c00);
-        delta_swap_1(x, 2, 0x33003300);
+        delta_swap_1(x, 8, 0x000f00ff00f00000);
+        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
 }
 
 #[inline(always)]
-fn inv_shift_rows_1(state: &mut [u32]) {
+fn inv_shift_rows_1(state: &mut [u64]) {
     shift_rows_3(state);
 }
 
 #[inline(always)]
-fn inv_shift_rows_2(state: &mut [u32]) {
+fn inv_shift_rows_2(state: &mut [u64]) {
     shift_rows_2(state);
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
-fn inv_shift_rows_3(state: &mut [u32]) {
+fn inv_shift_rows_3(state: &mut [u64]) {
     shift_rows_1(state);
 }
@@ -1163,66 +1185,84 @@ fn inv_shift_rows_3(state: &mut [u32]) {
 ///
 /// The `idx_ror` parameter refers to the rotation value, which varies between the
 /// different key schedules.
-fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
+fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
     for i in 0..8 {
         let off_i = offset + i;
-        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
-        rkeys[off_i] =
-            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
+        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
+        rkeys[off_i] = rk
+            ^ (0xfff0fff0fff0fff0 & (rk << 4))
+            ^ (0xff00ff00ff00ff00 & (rk << 8))
+            ^ (0xf000f000f000f000 & (rk << 12));
     }
 }
 
-/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state.
-fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
+/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
+fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
     debug_assert_eq!(output.len(), 8);
     debug_assert_eq!(input0.len(), 16);
     debug_assert_eq!(input1.len(), 16);
+    debug_assert_eq!(input2.len(), 16);
+    debug_assert_eq!(input3.len(), 16);
 
-    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an
-    // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
+    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
     // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b0 c1 c0 r1 r0 p2 p1 p0
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The desired bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b0
-
-    // Interleave the columns on input (note the order of input)
-    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
-    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
-    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
-    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
-    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
-    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
-    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
-    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
-    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());
-
-    // Bit Index Swap 5 <-> 0:
-    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
-    let m0 = 0x55555555;
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+
+    #[rustfmt::skip]
+    fn read_reordered(input: &[u8]) -> u64 {
+        (u64::from(input[0x0])        ) |
+        (u64::from(input[0x1]) << 0x10) |
+        (u64::from(input[0x2]) << 0x20) |
+        (u64::from(input[0x3]) << 0x30) |
+        (u64::from(input[0x8]) << 0x08) |
+        (u64::from(input[0x9]) << 0x18) |
+        (u64::from(input[0xa]) << 0x28) |
+        (u64::from(input[0xb]) << 0x38)
+    }
+
+    // Reorder each block's bytes on input
+    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
+    // Reorder by relabeling (note the order of input)
+    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
+    let mut t0 = read_reordered(&input0[0x00..0x0c]);
+    let mut t4 = read_reordered(&input0[0x04..0x10]);
+    let mut t1 = read_reordered(&input1[0x00..0x0c]);
+    let mut t5 = read_reordered(&input1[0x04..0x10]);
+    let mut t2 = read_reordered(&input2[0x00..0x0c]);
+    let mut t6 = read_reordered(&input2[0x04..0x10]);
+    let mut t3 = read_reordered(&input3[0x00..0x0c]);
+    let mut t7 = read_reordered(&input3[0x04..0x10]);
+
+    // Bit Index Swap 6 <-> 0:
+    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
+    let m0 = 0x5555555555555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
-    // Bit Index Swap 6 <-> 1:
-    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
-    let m1 = 0x33333333;
+    // Bit Index Swap 7 <-> 1:
+    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
+    let m1 = 0x3333333333333333;
     delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
-    // Bit Index Swap 7 <-> 2:
-    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
-    let m2 = 0x0f0f0f0f;
+    // Bit Index Swap 8 <-> 2:
+    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
+    let m2 = 0x0f0f0f0f0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
     // Final bitsliced bit index, as desired:
-    //     p2 p1 p0 r1 r0 c1 c0 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
     output[0] = t0;
     output[1] = t1;
     output[2] = t2;
@@ -1233,18 +1273,18 @@ fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
     output[7] = t7;
 }
 
-/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
-fn inv_bitslice(input: &mut [u32], output: &mut [Block]) {
+/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
+fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
     debug_assert_eq!(input.len(), 8);
-    debug_assert_eq!(output.len(), 2);
+    debug_assert_eq!(output.len(), 4);
 
-    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
-    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
+    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
     // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
-    //     b0 c1 c0 r1 r0 p2 p1 p0
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
     //
     // The initially bitsliced data groups first by bit position, then row, column, block:
-    //     p2 p1 p0 r1 r0 c1 c0 b0
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
 
     let mut t0 = input[0];
     let mut t1 = input[1];
@@ -1257,47 +1297,61 @@ fn inv_bitslice(input: &mut [u32], output: &mut [Block]) {
 
     // TODO: these bit index swaps are identical to those in 'packing'
 
-    // Bit Index Swap 5 <-> 0:
-    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
-    let m0 = 0x55555555;
+    // Bit Index Swap 6 <-> 0:
+    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
+    let m0 = 0x5555555555555555;
     delta_swap_2(&mut t1, &mut t0, 1, m0);
     delta_swap_2(&mut t3, &mut t2, 1, m0);
     delta_swap_2(&mut t5, &mut t4, 1, m0);
     delta_swap_2(&mut t7, &mut t6, 1, m0);
-    // Bit Index Swap 6 <-> 1:
-    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
-    let m1 = 0x33333333;
+    // Bit Index Swap 7 <-> 1:
+    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
+    let m1 = 0x3333333333333333;
     delta_swap_2(&mut t2, &mut t0, 2, m1);
     delta_swap_2(&mut t3, &mut t1, 2, m1);
     delta_swap_2(&mut t6, &mut t4, 2, m1);
     delta_swap_2(&mut t7, &mut t5, 2, m1);
-    // Bit Index Swap 7 <-> 2:
-    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
-    let m2 = 0x0f0f0f0f;
+    // Bit Index Swap 8 <-> 2:
+    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
+    let m2 = 0x0f0f0f0f0f0f0f0f;
     delta_swap_2(&mut t4, &mut t0, 4, m2);
     delta_swap_2(&mut t5, &mut t1, 4, m2);
     delta_swap_2(&mut t6, &mut t2, 4, m2);
     delta_swap_2(&mut t7, &mut t3, 4, m2);
 
-    // De-interleave the columns on output (note the order of output)
-    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
-    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
-    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
-    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
-    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
-    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
-    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
-    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
-    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());
+    #[rustfmt::skip]
+    fn write_reordered(columns: u64, output: &mut [u8]) {
+        output[0x0] = (columns        ) as u8;
+        output[0x1] = (columns >> 0x10) as u8;
+        output[0x2] = (columns >> 0x20) as u8;
+        output[0x3] = (columns >> 0x30) as u8;
+        output[0x8] = (columns >> 0x08) as u8;
+        output[0x9] = (columns >> 0x18) as u8;
+        output[0xa] = (columns >> 0x28) as u8;
+        output[0xb] = (columns >> 0x38) as u8;
+    }
+
+    // Reorder by relabeling (note the order of output)
+    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
+    // Reorder each block's bytes on output
+    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
+    write_reordered(t0, &mut output[0][0x00..0x0c]);
+    write_reordered(t4, &mut output[0][0x04..0x10]);
+    write_reordered(t1, &mut output[1][0x00..0x0c]);
+    write_reordered(t5, &mut output[1][0x04..0x10]);
+    write_reordered(t2, &mut output[2][0x00..0x0c]);
+    write_reordered(t6, &mut output[2][0x04..0x10]);
+    write_reordered(t3, &mut output[3][0x00..0x0c]);
+    write_reordered(t7, &mut output[3][0x04..0x10]);
 
     // Final AES bit index, as desired:
-    //     b0 c1 c0 r1 r0 p2 p1 p0
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
 }
 
 /// Copy 32-bytes within the provided slice to an 8-byte offset
-fn memshift32(buffer: &mut [u32], src_offset: usize) {
+fn memshift32(buffer: &mut [u64], src_offset: usize) {
     debug_assert_eq!(src_offset % 8, 0);
 
     let dst_offset = src_offset + 8;
@@ -1311,7 +1365,7 @@ fn memshift32(buffer: &mut [u32], src_offset: usize) {
 /// XOR the round key to the internal state. The round keys are expected to be
 /// pre-computed and to be packed in the fixsliced representation.
 #[inline]
-fn add_round_key(state: &mut State, rkey: &[u32]) {
+fn add_round_key(state: &mut State, rkey: &[u64]) {
     debug_assert_eq!(rkey.len(), 8);
     for (a, b) in state.iter_mut().zip(rkey) {
         *a ^= b;
@@ -1319,56 +1373,56 @@ fn add_round_key(state: &mut State, rkey: &[u32]) {
 }
 
 #[inline(always)]
-fn add_round_constant_bit(state: &mut [u32], bit: usize) {
-    state[bit] ^= 0x0000c000;
+fn add_round_constant_bit(state: &mut [u64], bit: usize) {
+    state[bit] ^= 0x00000000f0000000;
 }
 
 #[inline(always)]
-fn ror(x: u32, y: u32) -> u32 {
+fn ror(x: u64, y: u32) -> u64 {
     x.rotate_right(y)
 }
 
 #[inline(always)]
 fn ror_distance(rows: u32, cols: u32) -> u32 {
-    (rows << 3) + (cols << 1)
+    (rows << 4) + (cols << 2)
 }
 
 #[inline(always)]
-fn rotate_rows_1(x: u32) -> u32 {
+fn rotate_rows_1(x: u64) -> u64 {
     ror(x, ror_distance(1, 0))
 }
 
 #[inline(always)]
-fn rotate_rows_2(x: u32) -> u32 {
+fn rotate_rows_2(x: u64) -> u64 {
     ror(x, ror_distance(2, 0))
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
-    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
-    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
+fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
+    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
-    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
-    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
+fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
+    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
 }
 
 #[cfg(not(feature = "compact"))]
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
-    (ror(x, ror_distance(1, 3)) & 0x03030303) |
-    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
+fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
+    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
 }
 
 #[inline(always)]
 #[rustfmt::skip]
-fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
-    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
-    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
+fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
+    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
+    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
}
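
The bit re-indexing in bitslice, inv_bitslice and the shift_rows_* helpers above is built from a single delta-swap primitive, and going from u32 to u64 words changes only the masks and shift distances, because each (row, column) slot of the bitsliced state now carries four block bits instead of two (compare ror_distance, which goes from (rows << 3) + (cols << 1) to (rows << 4) + (cols << 2), and FIXSLICE_BLOCKS, which goes from 2 to 4). The following is a minimal standalone sketch of that primitive; it is not part of the patch, its function bodies mirror delta_swap_1/delta_swap_2 from the diff, and the inputs in main are arbitrary values chosen only to illustrate the effect.

// Illustrative sketch (not from the patch): the delta-swap primitive behind
// bitslice(), inv_bitslice() and the shift_rows_*() helpers above. For every
// set bit of `mask` at position p, the bit pair (p, p + shift) is exchanged;
// the mask picks which positions take part in the exchange.

fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
    // Exchange bits p and p + shift within a single word.
    let t = (*a ^ (*a >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
    // Exchange bit p of `a` with bit p + shift of `b`.
    let t = (*a ^ (*b >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}

fn main() {
    // Swap the low and high nibble of every byte: shift 4, low-nibble mask.
    let mut x = 0x1234_5678_9abc_def0_u64;
    delta_swap_1(&mut x, 4, 0x0f0f_0f0f_0f0f_0f0f);
    assert_eq!(x, 0x2143_6587_a9cb_ed0f);

    // Exchange bytes between two words: the even-numbered bytes of `a` swap
    // with the odd-numbered bytes of `b` (shift 8, low-byte-of-each-u16 mask).
    let mut a = 0x0000_0000_0000_0000_u64;
    let mut b = 0xaabb_ccdd_eeff_1122_u64;
    delta_swap_2(&mut a, &mut b, 8, 0x00ff_00ff_00ff_00ff);
    assert_eq!(a, 0x00aa_00cc_00ee_0011);
    assert_eq!(b, 0x00bb_00dd_00ff_0022);

    println!("delta swaps behave as described");
}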