diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml index abb51da6..16453c92 100644 --- a/.github/workflows/aes.yml +++ b/.github/workflows/aes.yml @@ -39,7 +39,8 @@ jobs: - run: cargo build --release --target ${{ matrix.target }} - run: cargo build --release --target ${{ matrix.target }} --features compact - run: cargo build --release --target ${{ matrix.target }} --features ctr - - run: cargo build --release --target ${{ matrix.target }} --features compact,ctr + - run: cargo build --release --target ${{ matrix.target }} --features force-soft + - run: cargo build --release --target ${{ matrix.target }} --all-features # Tests for the portable software backend soft: @@ -73,6 +74,7 @@ jobs: - run: cargo test --release --target ${{ matrix.target }} - run: cargo test --release --target ${{ matrix.target }} --features compact - run: cargo test --release --target ${{ matrix.target }} --features ctr + - run: cargo test --release --target ${{ matrix.target }} --features force-soft - run: cargo test --release --target ${{ matrix.target }} --all-features # Tests for the AES-NI backend @@ -111,6 +113,7 @@ jobs: - run: cargo test --release --target ${{ matrix.target }} - run: cargo test --release --target ${{ matrix.target }} --features compact - run: cargo test --release --target ${{ matrix.target }} --features ctr + - run: cargo test --release --target ${{ matrix.target }} --features force-soft - run: cargo test --release --target ${{ matrix.target }} --all-features # Cross-compiled tests @@ -144,4 +147,5 @@ jobs: - run: cross test --release --target ${{ matrix.target }} - run: cross test --release --target ${{ matrix.target }} --features compact - run: cross test --release --target ${{ matrix.target }} --features ctr - - run: cross test --release --target ${{ matrix.target }} --features compact,ctr + - run: cross test --release --target ${{ matrix.target }} --features force-soft + - run: cross test --release --target ${{ matrix.target }} --all-features diff --git a/Cargo.lock b/Cargo.lock index a21b2e7c..80983beb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,7 @@ version = "0.7.0-pre" dependencies = [ "cfg-if", "cipher", + "cpuid-bool", "ctr", "opaque-debug", ] @@ -73,6 +74,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "cpuid-bool" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb25d077389e53838a8158c8e99174c5a9d902dee4904320db714f3c653ffba" + [[package]] name = "ctr" version = "0.7.0-pre" diff --git a/aes/Cargo.toml b/aes/Cargo.toml index eda40a21..fa876265 100644 --- a/aes/Cargo.toml +++ b/aes/Cargo.toml @@ -23,9 +23,13 @@ opaque-debug = "0.3" [dev-dependencies] cipher = { version = "=0.3.0-pre", features = ["dev"] } +[target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies] +cpuid-bool = "0.2" + [features] -compact = [] # Reduce code size at the cost of performance +compact = [] # Reduce code size at the cost of slower performance +force-soft = [] # Disable support for AES hardware intrinsics [package.metadata.docs.rs] -all-features = true +features = ["ctr"] rustdoc-args = ["--cfg", "docsrs"] diff --git a/aes/src/autodetect.rs b/aes/src/autodetect.rs new file mode 100644 index 00000000..2e15b7fd --- /dev/null +++ b/aes/src/autodetect.rs @@ -0,0 +1,201 @@ +//! Autodetection support for hardware accelerated AES backends with fallback +//! to the fixsliced "soft" implementation. 
+
+use crate::{Block, ParBlocks};
+use cipher::{
+    consts::{U16, U24, U32, U8},
+    generic_array::GenericArray,
+    BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher,
+};
+
+cpuid_bool::new!(aes_cpuid, "aes");
+
+macro_rules! define_aes_impl {
+    (
+        $name:tt,
+        $module:tt,
+        $key_size:ty,
+        $doc:expr
+    ) => {
+        #[doc=$doc]
+        #[derive(Clone)]
+        pub struct $name {
+            inner: $module::Inner,
+            token: aes_cpuid::InitToken
+        }
+
+        mod $module {
+            #[derive(Copy, Clone)]
+            pub(super) union Inner {
+                pub(super) ni: crate::ni::$name,
+                pub(super) soft: crate::soft::$name,
+            }
+        }
+
+        impl NewBlockCipher for $name {
+            type KeySize = $key_size;
+
+            #[inline]
+            fn new(key: &GenericArray<u8, $key_size>) -> Self {
+                let (token, aesni_present) = aes_cpuid::init_get();
+
+                let inner = if aesni_present {
+                    $module::Inner { ni: crate::ni::$name::new(key) }
+                } else {
+                    $module::Inner { soft: crate::soft::$name::new(key) }
+                };
+
+                Self { inner, token }
+            }
+        }
+
+        impl BlockCipher for $name {
+            type BlockSize = U16;
+            type ParBlocks = U8;
+        }
+
+        impl BlockEncrypt for $name {
+            #[inline]
+            fn encrypt_block(&self, block: &mut Block) {
+                if self.token.get() {
+                    unsafe { self.inner.ni.encrypt_block(block) }
+                } else {
+                    unsafe { self.inner.soft.encrypt_block(block) }
+                }
+            }
+
+            #[inline]
+            fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) {
+                if self.token.get() {
+                    unsafe { self.inner.ni.encrypt_par_blocks(blocks) }
+                } else {
+                    unsafe { self.inner.soft.encrypt_par_blocks(blocks) }
+                }
+            }
+        }
+
+        impl BlockDecrypt for $name {
+            #[inline]
+            fn decrypt_block(&self, block: &mut Block) {
+                if self.token.get() {
+                    unsafe { self.inner.ni.decrypt_block(block) }
+                } else {
+                    unsafe { self.inner.soft.decrypt_block(block) }
+                }
+            }
+
+            #[inline]
+            fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) {
+                if self.token.get() {
+                    unsafe { self.inner.ni.decrypt_par_blocks(blocks) }
+                } else {
+                    unsafe { self.inner.soft.decrypt_par_blocks(blocks) }
+                }
+            }
+        }
+
+        opaque_debug::implement!($name);
+    }
+}
+
+define_aes_impl!(Aes128, aes128, U16, "AES-128 block cipher instance");
+define_aes_impl!(Aes192, aes192, U24, "AES-192 block cipher instance");
+define_aes_impl!(Aes256, aes256, U32, "AES-256 block cipher instance");
+
+#[cfg(feature = "ctr")]
+pub(crate) mod ctr {
+    use super::{aes_cpuid, Aes128, Aes192, Aes256};
+    use cipher::{
+        block::BlockCipher,
+        generic_array::GenericArray,
+        stream::{
+            FromBlockCipher, LoopError, OverflowError, SeekNum, SyncStreamCipher,
+            SyncStreamCipherSeek,
+        },
+    };
+
+    macro_rules! define_aes_ctr_impl {
+        (
+            $name:tt,
+            $cipher:ident,
+            $module:tt,
+            $doc:expr
+        ) => {
+            #[doc=$doc]
+            #[cfg_attr(docsrs, doc(cfg(feature = "ctr")))]
+            pub struct $name {
+                inner: $module::Inner,
+            }
+
+            mod $module {
+                #[allow(clippy::large_enum_variant)]
+                pub(super) enum Inner {
+                    Ni(crate::ni::$name),
+                    Soft(crate::soft::$name),
+                }
+            }
+
+            impl FromBlockCipher for $name {
+                type BlockCipher = $cipher;
+                type NonceSize = <$cipher as BlockCipher>::BlockSize;
+
+                fn from_block_cipher(
+                    cipher: $cipher,
+                    nonce: &GenericArray<u8, Self::NonceSize>,
+                ) -> Self {
+                    let inner = if aes_cpuid::get() {
+                        $module::Inner::Ni(
+                            crate::ni::$name::from_block_cipher(
+                                unsafe { cipher.inner.ni },
+                                nonce
+                            )
+                        )
+                    } else {
+                        $module::Inner::Soft(
+                            crate::soft::$name::from_block_cipher(
+                                unsafe { cipher.inner.soft },
+                                nonce
+                            )
+                        )
+                    };
+
+                    Self { inner }
+                }
+            }
+
+            impl SyncStreamCipher for $name {
+                #[inline]
+                fn try_apply_keystream(&mut self, data: &mut [u8]) -> Result<(), LoopError> {
+                    match &mut self.inner {
+                        $module::Inner::Ni(aes) => aes.try_apply_keystream(data),
+                        $module::Inner::Soft(aes) => aes.try_apply_keystream(data)
+                    }
+                }
+            }
+
+            impl SyncStreamCipherSeek for $name {
+                #[inline]
+                fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> {
+                    match &self.inner {
+                        $module::Inner::Ni(aes) => aes.try_current_pos(),
+                        $module::Inner::Soft(aes) => aes.try_current_pos()
+                    }
+                }
+
+                #[inline]
+                fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> {
+                    match &mut self.inner {
+                        $module::Inner::Ni(aes) => aes.try_seek(pos),
+                        $module::Inner::Soft(aes) => aes.try_seek(pos)
+                    }
+                }
+            }
+
+            opaque_debug::implement!($name);
+        }
+    }
+
+    define_aes_ctr_impl!(Aes128Ctr, Aes128, aes128ctr, "AES-128 in CTR mode");
+    define_aes_ctr_impl!(Aes192Ctr, Aes192, aes192ctr, "AES-192 in CTR mode");
+    define_aes_ctr_impl!(Aes256Ctr, Aes256, aes256ctr, "AES-256 in CTR mode");
+}
diff --git a/aes/src/lib.rs b/aes/src/lib.rs
index a5147d97..68a951d6 100644
--- a/aes/src/lib.rs
+++ b/aes/src/lib.rs
@@ -63,21 +63,17 @@ use cfg_if::cfg_if;
 
 cfg_if! {
     if #[cfg(all(
-        target_feature = "aes",
-        target_feature = "sse2",
         any(target_arch = "x86_64", target_arch = "x86"),
+        not(feature = "force-soft")
     ))] {
+        mod autodetect;
         mod ni;
-        pub use ni::{Aes128, Aes192, Aes256};
+        mod soft;
+
+        pub use autodetect::{Aes128, Aes192, Aes256};
 
         #[cfg(feature = "ctr")]
-        cfg_if! {
-            if #[cfg(target_feature = "ssse3")] {
-                pub use ni::{Aes128Ctr, Aes192Ctr, Aes256Ctr};
-            } else {
-                compile_error!("Please enable the +ssse3 target feature to use `ctr` with AES-NI")
-            }
-        }
+        pub use autodetect::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr};
     } else {
         mod soft;
         pub use soft::{Aes128, Aes192, Aes256};
@@ -87,7 +83,7 @@ cfg_if! {
     }
 }
 
-pub use cipher::{self, BlockCipher, NewBlockCipher};
+pub use cipher::{self, BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher};
 
 /// 128-bit AES block
 pub type Block = cipher::generic_array::GenericArray<u8, cipher::consts::U16>;
diff --git a/aes/src/ni/aes128.rs b/aes/src/ni/aes128.rs
index 6f37cd53..f6f3fcfe 100644
--- a/aes/src/ni/aes128.rs
+++ b/aes/src/ni/aes128.rs
@@ -112,6 +112,7 @@ impl BlockDecrypt for Aes128 {
         // Safety: `loadu` and `storeu` support unaligned access
         #[allow(clippy::cast_ptr_alignment)]
         let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
         b = _mm_xor_si128(b, keys[10]);
         b = _mm_aesdec_si128(b, keys[9]);
         b = _mm_aesdec_si128(b, keys[8]);
@@ -123,6 +124,9 @@ impl BlockDecrypt for Aes128 {
         b = _mm_aesdec_si128(b, keys[2]);
         b = _mm_aesdec_si128(b, keys[1]);
         b = _mm_aesdeclast_si128(b, keys[0]);
+
+        // Safety: `loadu` and `storeu` support unaligned access
+        #[allow(clippy::cast_ptr_alignment)]
         _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
     }
 
diff --git a/aes/src/ni/aes192.rs b/aes/src/ni/aes192.rs
index 9cd4c345..4e3ac724 100644
--- a/aes/src/ni/aes192.rs
+++ b/aes/src/ni/aes192.rs
@@ -114,6 +114,7 @@ impl BlockDecrypt for Aes192 {
         // Safety: `loadu` and `storeu` support unaligned access
         #[allow(clippy::cast_ptr_alignment)]
         let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
         b = _mm_xor_si128(b, keys[12]);
         b = _mm_aesdec_si128(b, keys[11]);
         b = _mm_aesdec_si128(b, keys[10]);
@@ -127,6 +128,9 @@ impl BlockDecrypt for Aes192 {
         b = _mm_aesdec_si128(b, keys[2]);
         b = _mm_aesdec_si128(b, keys[1]);
         b = _mm_aesdeclast_si128(b, keys[0]);
+
+        // Safety: `loadu` and `storeu` support unaligned access
+        #[allow(clippy::cast_ptr_alignment)]
         _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
     }
 
diff --git a/aes/src/ni/aes256.rs b/aes/src/ni/aes256.rs
index 17d47a46..4e1a70fc 100644
--- a/aes/src/ni/aes256.rs
+++ b/aes/src/ni/aes256.rs
@@ -118,6 +118,7 @@ impl BlockDecrypt for Aes256 {
         // Safety: `loadu` and `storeu` support unaligned access
         #[allow(clippy::cast_ptr_alignment)]
         let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
         b = _mm_xor_si128(b, keys[14]);
         b = _mm_aesdec_si128(b, keys[13]);
         b = _mm_aesdec_si128(b, keys[12]);
@@ -133,6 +134,9 @@ impl BlockDecrypt for Aes256 {
         b = _mm_aesdec_si128(b, keys[2]);
         b = _mm_aesdec_si128(b, keys[1]);
         b = _mm_aesdeclast_si128(b, keys[0]);
+
+        // Safety: `loadu` and `storeu` support unaligned access
+        #[allow(clippy::cast_ptr_alignment)]
         _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b);
     }
 
diff --git a/aes/src/soft/fixslice64.rs b/aes/src/soft/fixslice64.rs
index f4e57fc7..802081e4 100644
--- a/aes/src/soft/fixslice64.rs
+++ b/aes/src/soft/fixslice64.rs
@@ -20,27 +20,28 @@ use cipher::{
     consts::{U16, U24, U32},
     generic_array::GenericArray,
 };
+use core::convert::TryInto;
 
 /// AES block batch size for this implementation
-pub(crate) const FIXSLICE_BLOCKS: usize = 4;
+pub(crate) const FIXSLICE_BLOCKS: usize = 2;
 
 /// AES-128 round keys
-pub(crate) type FixsliceKeys128 = [u64; 88];
+pub(crate) type FixsliceKeys128 = [u32; 88];
 
 /// AES-192 round keys
-pub(crate) type FixsliceKeys192 = [u64; 104];
+pub(crate) type FixsliceKeys192 = [u32; 104];
 
 /// AES-256 round keys
-pub(crate) type FixsliceKeys256 = [u64; 120];
+pub(crate) type FixsliceKeys256 = [u32; 120];
 
-/// 512-bit internal state
-type State = [u64; 8];
+/// 256-bit internal state
+type State = [u32; 8];
 
 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &GenericArray) -> FixsliceKeys128 { - let mut rkeys = [0u64; 88]; + let mut rkeys = [0u32; 88]; - bitslice(&mut rkeys[..8], key, key, key, key); + bitslice(&mut rkeys[..8], key, key); let mut rk_off = 0; for rcon in 0..10 { @@ -89,25 +90,19 @@ pub(crate) fn aes128_key_schedule(key: &GenericArray) -> FixsliceKeys12 /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys192 { - let mut rkeys = [0u64; 104]; - let mut tmp = [0u64; 8]; - - bitslice( - &mut rkeys[..8], - &key[..16], - &key[..16], - &key[..16], - &key[..16], - ); - bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); + let mut rkeys = [0u32; 104]; + let mut tmp = [0u32; 8]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut tmp, &key[8..], &key[8..]); let mut rcon = 0; let mut rk_off = 8; loop { for i in 0..8 { - rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) - | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + rkeys[rk_off + i] = + (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); } sub_bytes(&mut tmp); @@ -118,8 +113,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys19 for i in 0..8 { let mut ti = rkeys[rk_off + i]; - ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); - ti ^= 0xf000f000f000f000 & (ti << 4); + ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xc0c0c0c0 & (ti << 2); tmp[i] = ti; } rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); @@ -127,13 +122,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys19 for i in 0..8 { let ui = tmp[i]; - let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) - | (0xff00ff00ff00ff00 & (ui << 8)); - ti ^= 0x000f000f000f000f & (ui >> 12); - tmp[i] = ti - ^ (0xfff0fff0fff0fff0 & (ti << 4)) - ^ (0xff00ff00ff00ff00 & (ti << 8)) - ^ (0xf000f000f000f000 & (ti << 12)); + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4)); + ti ^= 0x03030303 & (ui >> 6); + tmp[i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); } rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); rk_off += 8; @@ -145,13 +137,11 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys19 rcon += 1; for i in 0..8 { - let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) - | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); - ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); - rkeys[rk_off + i] = ti - ^ (0xfff0fff0fff0fff0 & (ti << 4)) - ^ (0xff00ff00ff00ff00 & (ti << 8)) - ^ (0xf000f000f000f000 & (ti << 12)); + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) + | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); } rk_off += 8; @@ -162,8 +152,8 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys19 for i in 0..8 { let ui = rkeys[(rk_off - 8) + i]; let mut ti = rkeys[(rk_off - 16) + i]; - ti ^= 0x0f000f000f000f00 & (ui >> 4); - ti ^= 0xf000f000f000f000 & (ti << 4); + ti ^= 0x30303030 & (ui >> 2); + ti ^= 0xc0c0c0c0 & (ti << 2); tmp[i] = ti; } } @@ -194,22 +184,10 @@ pub(crate) fn aes192_key_schedule(key: &GenericArray) -> FixsliceKeys19 /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. 
pub(crate) fn aes256_key_schedule(key: &GenericArray) -> FixsliceKeys256 { - let mut rkeys = [0u64; 120]; - - bitslice( - &mut rkeys[..8], - &key[..16], - &key[..16], - &key[..16], - &key[..16], - ); - bitslice( - &mut rkeys[8..16], - &key[16..], - &key[16..], - &key[16..], - &key[16..], - ); + let mut rkeys = [0u32; 120]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut rkeys[8..16], &key[16..], &key[16..]); let mut rk_off = 8; @@ -270,7 +248,7 @@ pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[80..]); inv_sub_bytes(&mut state); @@ -327,7 +305,7 @@ pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[..8]); @@ -384,7 +362,7 @@ pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[96..]); inv_sub_bytes(&mut state); @@ -435,7 +413,7 @@ pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[..8]); @@ -486,7 +464,7 @@ pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[112..]); inv_sub_bytes(&mut state); @@ -543,7 +521,7 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); let mut state = State::default(); - bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + bitslice(&mut state, &blocks[0], &blocks[1]); add_round_key(&mut state, &rkeys[..8]); @@ -593,9 +571,9 @@ pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { inv_bitslice(&mut state, blocks); } -/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true +/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true /// inverse of 'sub_bytes'. -fn inv_sub_bytes(state: &mut [u64]) { +fn inv_sub_bytes(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler @@ -803,8 +781,8 @@ fn inv_sub_bytes(state: &mut [u64]) { /// /// See: /// -/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule. -fn sub_bytes(state: &mut [u64]) { +/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule. 
+fn sub_bytes(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler @@ -977,12 +955,12 @@ fn sub_bytes(state: &mut [u64]) { /// NOT operations that are omitted in S-box #[inline] -fn sub_bytes_nots(state: &mut [u64]) { +fn sub_bytes_nots(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); - state[0] ^= 0xffffffffffffffff; - state[1] ^= 0xffffffffffffffff; - state[5] ^= 0xffffffffffffffff; - state[6] ^= 0xffffffffffffffff; + state[0] ^= 0xffffffff; + state[1] ^= 0xffffffff; + state[5] ^= 0xffffffff; + state[6] ^= 0xffffffff; } /// Computation of the MixColumns transformation in the fixsliced representation, with different @@ -1119,13 +1097,13 @@ define_mix_columns!( ); #[inline] -fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { +fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) { let t = (*a ^ ((*a) >> shift)) & mask; *a ^= t ^ (t << shift); } #[inline] -fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) { +fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) { let t = (*a ^ ((*b) >> shift)) & mask; *a ^= t; *b ^= t << shift; @@ -1134,46 +1112,46 @@ fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) { /// Applies ShiftRows once on an AES state (or key). #[cfg(not(feature = "compact"))] #[inline] -fn shift_rows_1(state: &mut [u64]) { +fn shift_rows_1(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); for x in state.iter_mut() { - delta_swap_1(x, 8, 0x00f000ff000f0000); - delta_swap_1(x, 4, 0x0f0f00000f0f0000); + delta_swap_1(x, 4, 0x0c0f0300); + delta_swap_1(x, 2, 0x33003300); } } /// Applies ShiftRows twice on an AES state (or key). #[inline] -fn shift_rows_2(state: &mut [u64]) { +fn shift_rows_2(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); for x in state.iter_mut() { - delta_swap_1(x, 8, 0x00ff000000ff0000); + delta_swap_1(x, 4, 0x0f000f00); } } /// Applies ShiftRows three times on an AES state (or key). #[inline] -fn shift_rows_3(state: &mut [u64]) { +fn shift_rows_3(state: &mut [u32]) { debug_assert_eq!(state.len(), 8); for x in state.iter_mut() { - delta_swap_1(x, 8, 0x000f00ff00f00000); - delta_swap_1(x, 4, 0x0f0f00000f0f0000); + delta_swap_1(x, 4, 0x030f0c00); + delta_swap_1(x, 2, 0x33003300); } } #[inline(always)] -fn inv_shift_rows_1(state: &mut [u64]) { +fn inv_shift_rows_1(state: &mut [u32]) { shift_rows_3(state); } #[inline(always)] -fn inv_shift_rows_2(state: &mut [u64]) { +fn inv_shift_rows_2(state: &mut [u32]) { shift_rows_2(state); } #[cfg(not(feature = "compact"))] #[inline(always)] -fn inv_shift_rows_3(state: &mut [u64]) { +fn inv_shift_rows_3(state: &mut [u32]) { shift_rows_1(state); } @@ -1185,84 +1163,66 @@ fn inv_shift_rows_3(state: &mut [u64]) { /// /// The `idx_ror` parameter refers to the rotation value, which varies between the /// different key schedules. 
-fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { +fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) { for i in 0..8 { let off_i = offset + i; - let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror)); - rkeys[off_i] = rk - ^ (0xfff0fff0fff0fff0 & (rk << 4)) - ^ (0xff00ff00ff00ff00 & (rk << 8)) - ^ (0xf000f000f000f000 & (rk << 12)); + let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = + rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6)); } } -/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. -fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { +/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state. +fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) { debug_assert_eq!(output.len(), 8); debug_assert_eq!(input0.len(), 16); debug_assert_eq!(input1.len(), 16); - debug_assert_eq!(input2.len(), 16); - debug_assert_eq!(input3.len(), 16); - // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a - // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an + // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): - // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // b0 c1 c0 r1 r0 p2 p1 p0 // // The desired bitsliced data groups first by bit position, then row, column, block: - // p2 p1 p0 r1 r0 c1 c0 b1 b0 - - #[rustfmt::skip] - fn read_reordered(input: &[u8]) -> u64 { - (u64::from(input[0x0]) ) | - (u64::from(input[0x1]) << 0x10) | - (u64::from(input[0x2]) << 0x20) | - (u64::from(input[0x3]) << 0x30) | - (u64::from(input[0x8]) << 0x08) | - (u64::from(input[0x9]) << 0x18) | - (u64::from(input[0xa]) << 0x28) | - (u64::from(input[0xb]) << 0x38) - } - - // Reorder each block's bytes on input - // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ - // Reorder by relabeling (note the order of input) - // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ - let mut t0 = read_reordered(&input0[0x00..0x0c]); - let mut t4 = read_reordered(&input0[0x04..0x10]); - let mut t1 = read_reordered(&input1[0x00..0x0c]); - let mut t5 = read_reordered(&input1[0x04..0x10]); - let mut t2 = read_reordered(&input2[0x00..0x0c]); - let mut t6 = read_reordered(&input2[0x04..0x10]); - let mut t3 = read_reordered(&input3[0x00..0x0c]); - let mut t7 = read_reordered(&input3[0x04..0x10]); - - // Bit Index Swap 6 <-> 0: - // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 - let m0 = 0x5555555555555555; + // p2 p1 p0 r1 r0 c1 c0 b0 + + // Interleave the columns on input (note the order of input) + // b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __ + let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap()); + let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap()); + let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap()); + let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap()); + let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap()); + let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap()); + let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap()); + let mut 
t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap()); + + // Bit Index Swap 5 <-> 0: + // __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0 + let m0 = 0x55555555; delta_swap_2(&mut t1, &mut t0, 1, m0); delta_swap_2(&mut t3, &mut t2, 1, m0); delta_swap_2(&mut t5, &mut t4, 1, m0); delta_swap_2(&mut t7, &mut t6, 1, m0); - // Bit Index Swap 7 <-> 1: - // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ - let m1 = 0x3333333333333333; + // Bit Index Swap 6 <-> 1: + // __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __ + let m1 = 0x33333333; delta_swap_2(&mut t2, &mut t0, 2, m1); delta_swap_2(&mut t3, &mut t1, 2, m1); delta_swap_2(&mut t6, &mut t4, 2, m1); delta_swap_2(&mut t7, &mut t5, 2, m1); - // Bit Index Swap 8 <-> 2: - // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ - let m2 = 0x0f0f0f0f0f0f0f0f; + // Bit Index Swap 7 <-> 2: + // c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __ + let m2 = 0x0f0f0f0f; delta_swap_2(&mut t4, &mut t0, 4, m2); delta_swap_2(&mut t5, &mut t1, 4, m2); delta_swap_2(&mut t6, &mut t2, 4, m2); delta_swap_2(&mut t7, &mut t3, 4, m2); // Final bitsliced bit index, as desired: - // p2 p1 p0 r1 r0 c1 c0 b1 b0 + // p2 p1 p0 r1 r0 c1 c0 b0 output[0] = t0; output[1] = t1; output[2] = t2; @@ -1273,18 +1233,18 @@ fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], inp output[7] = t7; } -/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. -fn inv_bitslice(input: &mut [u64], output: &mut [Block]) { +/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output. +fn inv_bitslice(input: &mut [u32], output: &mut [Block]) { debug_assert_eq!(input.len(), 8); - debug_assert_eq!(output.len(), 4); + debug_assert_eq!(output.len(), 2); - // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at - // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at + // an 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): - // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // b0 c1 c0 r1 r0 p2 p1 p0 // // The initially bitsliced data groups first by bit position, then row, column, block: - // p2 p1 p0 r1 r0 c1 c0 b1 b0 + // p2 p1 p0 r1 r0 c1 c0 b0 let mut t0 = input[0]; let mut t1 = input[1]; @@ -1297,61 +1257,47 @@ fn inv_bitslice(input: &mut [u64], output: &mut [Block]) { // TODO: these bit index swaps are identical to those in 'packing' - // Bit Index Swap 6 <-> 0: - // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 - let m0 = 0x5555555555555555; + // Bit Index Swap 5 <-> 0: + // __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0 + let m0 = 0x55555555; delta_swap_2(&mut t1, &mut t0, 1, m0); delta_swap_2(&mut t3, &mut t2, 1, m0); delta_swap_2(&mut t5, &mut t4, 1, m0); delta_swap_2(&mut t7, &mut t6, 1, m0); - // Bit Index Swap 7 <-> 1: - // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ - let m1 = 0x3333333333333333; + // Bit Index Swap 6 <-> 1: + // __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __ + let m1 = 0x33333333; delta_swap_2(&mut t2, &mut t0, 2, m1); delta_swap_2(&mut t3, &mut t1, 2, m1); delta_swap_2(&mut t6, &mut t4, 2, m1); delta_swap_2(&mut t7, &mut t5, 2, m1); - // Bit Index Swap 8 <-> 2: - // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ - let m2 = 0x0f0f0f0f0f0f0f0f; + // Bit Index Swap 7 <-> 2: + // p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f; delta_swap_2(&mut t4, &mut t0, 4, m2); delta_swap_2(&mut t5, &mut t1, 4, m2); delta_swap_2(&mut t6, &mut t2, 4, m2); delta_swap_2(&mut t7, &mut t3, 4, m2); - #[rustfmt::skip] - fn write_reordered(columns: u64, output: &mut [u8]) { - output[0x0] = (columns ) as u8; - output[0x1] = (columns >> 0x10) as u8; - output[0x2] = (columns >> 0x20) as u8; - output[0x3] = (columns >> 0x30) as u8; - output[0x8] = (columns >> 0x08) as u8; - output[0x9] = (columns >> 0x18) as u8; - output[0xa] = (columns >> 0x28) as u8; - output[0xb] = (columns >> 0x38) as u8; - } - - // Reorder by relabeling (note the order of output) - // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ - // Reorder each block's bytes on output - // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ - write_reordered(t0, &mut output[0][0x00..0x0c]); - write_reordered(t4, &mut output[0][0x04..0x10]); - write_reordered(t1, &mut output[1][0x00..0x0c]); - write_reordered(t5, &mut output[1][0x04..0x10]); - write_reordered(t2, &mut output[2][0x00..0x0c]); - write_reordered(t6, &mut output[2][0x04..0x10]); - write_reordered(t3, &mut output[3][0x00..0x0c]); - write_reordered(t7, &mut output[3][0x04..0x10]); + // De-interleave the columns on output (note the order of output) + // c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __ + output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes()); + output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes()); + output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes()); + output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes()); + output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes()); + output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes()); + output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes()); + output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes()); // Final AES bit index, as desired: - // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // b0 c1 c0 r1 r0 p2 p1 p0 } /// Copy 32-bytes within the provided slice to an 8-byte offset -fn memshift32(buffer: &mut [u64], 
src_offset: usize) { +fn memshift32(buffer: &mut [u32], src_offset: usize) { debug_assert_eq!(src_offset % 8, 0); let dst_offset = src_offset + 8; @@ -1365,7 +1311,7 @@ fn memshift32(buffer: &mut [u64], src_offset: usize) { /// XOR the round key to the internal state. The round keys are expected to be /// pre-computed and to be packed in the fixsliced representation. #[inline] -fn add_round_key(state: &mut State, rkey: &[u64]) { +fn add_round_key(state: &mut State, rkey: &[u32]) { debug_assert_eq!(rkey.len(), 8); for (a, b) in state.iter_mut().zip(rkey) { *a ^= b; @@ -1373,56 +1319,56 @@ fn add_round_key(state: &mut State, rkey: &[u64]) { } #[inline(always)] -fn add_round_constant_bit(state: &mut [u64], bit: usize) { - state[bit] ^= 0x00000000f0000000; +fn add_round_constant_bit(state: &mut [u32], bit: usize) { + state[bit] ^= 0x0000c000; } #[inline(always)] -fn ror(x: u64, y: u32) -> u64 { +fn ror(x: u32, y: u32) -> u32 { x.rotate_right(y) } #[inline(always)] fn ror_distance(rows: u32, cols: u32) -> u32 { - (rows << 4) + (cols << 2) + (rows << 3) + (cols << 1) } #[inline(always)] -fn rotate_rows_1(x: u64) -> u64 { +fn rotate_rows_1(x: u32) -> u32 { ror(x, ror_distance(1, 0)) } #[inline(always)] -fn rotate_rows_2(x: u64) -> u64 { +fn rotate_rows_2(x: u32) -> u32 { ror(x, ror_distance(2, 0)) } #[inline(always)] #[rustfmt::skip] -fn rotate_rows_and_columns_1_1(x: u64) -> u64 { - (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) | - (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000) +fn rotate_rows_and_columns_1_1(x: u32) -> u32 { + (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) | + (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0) } #[cfg(not(feature = "compact"))] #[inline(always)] #[rustfmt::skip] -fn rotate_rows_and_columns_1_2(x: u64) -> u64 { - (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) | - (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00) +fn rotate_rows_and_columns_1_2(x: u32) -> u32 { + (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0) } #[cfg(not(feature = "compact"))] #[inline(always)] #[rustfmt::skip] -fn rotate_rows_and_columns_1_3(x: u64) -> u64 { - (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) | - (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0) +fn rotate_rows_and_columns_1_3(x: u32) -> u32 { + (ror(x, ror_distance(1, 3)) & 0x03030303) | + (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc) } #[inline(always)] #[rustfmt::skip] -fn rotate_rows_and_columns_2_2(x: u64) -> u64 { - (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) | - (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00) +fn rotate_rows_and_columns_2_2(x: u32) -> u32 { + (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0) } diff --git a/aes/src/soft/impls.rs b/aes/src/soft/impls.rs index 5b76926c..31c2e931 100644 --- a/aes/src/soft/impls.rs +++ b/aes/src/soft/impls.rs @@ -20,7 +20,7 @@ macro_rules! define_aes_impl { $doc:expr ) => { #[doc=$doc] - #[derive(Clone)] + #[derive(Copy, Clone)] pub struct $name { keys: $fixslice_keys, }
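
Usage sketch (reviewer note, not part of the patch): the snippet below shows how the autodetecting `Aes128` type added in `aes/src/autodetect.rs` would be exercised through the `BlockEncrypt`/`BlockDecrypt`/`NewBlockCipher` re-exports introduced in `aes/src/lib.rs`. It assumes the `aes` crate as patched above, built on x86/x86_64 with default features; the key and block bytes are arbitrary placeholders, not test vectors from this change.

// Hedged example: a round-trip through whichever backend is detected at runtime.
use aes::cipher::generic_array::GenericArray;
use aes::{Aes128, BlockDecrypt, BlockEncrypt, NewBlockCipher};

fn main() {
    // `new` runs CPUID detection once (via cpuid-bool) and stores either an
    // AES-NI or a fixsliced "soft" key schedule inside the union.
    let key = GenericArray::from([0u8; 16]);
    let cipher = Aes128::new(&key);

    let mut block = GenericArray::from([0x42u8; 16]);
    let original = block;

    // Every call checks the stored InitToken and dispatches to the same
    // BlockEncrypt/BlockDecrypt implementation on either backend.
    cipher.encrypt_block(&mut block);
    cipher.decrypt_block(&mut block);
    assert_eq!(block, original);
}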