Re-add SSE2 with soft fallback for non-Salsa20/20 variants #348

Merged · 12 commits · Apr 29, 2024
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions salsa20/Cargo.toml
@@ -13,6 +13,7 @@ keywords = ["crypto", "stream-cipher", "trait", "xsalsa20"]
categories = ["cryptography", "no-std"]

[dependencies]
cfg-if = "1"
cipher = "=0.5.0-pre.4"

[dev-dependencies]
20 changes: 20 additions & 0 deletions salsa20/src/backends.rs
@@ -0,0 +1,20 @@
use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(salsa20_force_soft)] {
        pub(crate) mod soft;
    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if! {
            if #[cfg(salsa20_force_sse2)] {
                pub(crate) mod sse2;
            } else {
                // Build both backends: sse2 falls back to soft at runtime
                // for variants other than Salsa20/20.
                pub(crate) mod sse2;
                pub(crate) mod soft;
            }
        }
    } else {
        pub(crate) mod soft;
    }
}
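Whichever module the cfg resolves to, the crate's public API is unchanged. A minimal usage sketch, assuming the `KeyIvInit` and `StreamCipher` traits re-exported from the pinned `cipher = "=0.5.0-pre.4"` (pass `RUSTFLAGS='--cfg salsa20_force_soft'` at build time to pin the portable backend):

use salsa20::cipher::{KeyIvInit, StreamCipher};
use salsa20::Salsa20;

fn roundtrip() {
    let key = [0x42u8; 32]; // 256-bit key
    let nonce = [0x24u8; 8]; // 64-bit nonce
    let mut data = *b"dispatched transparently";

    // Uses the SSE2 backend on x86/x86_64 for Salsa20/20,
    // and the portable backend everywhere else.
    let mut cipher = Salsa20::new(&key.into(), &nonce.into());
    cipher.apply_keystream(&mut data);

    // Re-applying the keystream from the start decrypts.
    let mut cipher = Salsa20::new(&key.into(), &nonce.into());
    cipher.apply_keystream(&mut data);
    assert_eq!(&data, b"dispatched transparently");
}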
70 changes: 70 additions & 0 deletions salsa20/src/backends/soft.rs
@@ -0,0 +1,70 @@
//! Portable implementation which does not rely on architecture-specific
//! intrinsics.

use crate::{Block, SalsaCore, Unsigned, STATE_WORDS};
use cipher::{
    consts::{U1, U64},
    BlockSizeUser, ParBlocksSizeUser, StreamBackend, StreamCipherSeekCore,
};

pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);

impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> {
    type BlockSize = U64;
}

impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> {
    type ParBlocksSize = U1;
}

impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        let res = run_rounds::<R>(&self.0.state);

        self.0.set_block_pos(self.0.get_block_pos() + 1);

        for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
            chunk.copy_from_slice(&val.to_le_bytes());
        }
    }
}

#[inline]
#[allow(clippy::many_single_char_names)]
pub(crate) fn quarter_round(
    a: usize,
    b: usize,
    c: usize,
    d: usize,
    state: &mut [u32; STATE_WORDS],
) {
    state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
    state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
    state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
    state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
}

#[inline(always)]
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
    let mut res = *state;

    // R counts double rounds: one column round plus one row round.
    for _ in 0..R::USIZE {
        // column rounds
        quarter_round(0, 4, 8, 12, &mut res);
        quarter_round(5, 9, 13, 1, &mut res);
        quarter_round(10, 14, 2, 6, &mut res);
        quarter_round(15, 3, 7, 11, &mut res);

        // diagonal rounds
        quarter_round(0, 1, 2, 3, &mut res);
        quarter_round(5, 6, 7, 4, &mut res);
        quarter_round(10, 11, 8, 9, &mut res);
        quarter_round(15, 12, 13, 14, &mut res);
    }

    for (s1, s0) in res.iter_mut().zip(state.iter()) {
        *s1 = s1.wrapping_add(*s0);
    }
    res
}
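A hypothetical unit test (not part of this diff) pinning `quarter_round` against the worked example in section 3 of the Salsa20 specification, quarterround(0x00000001, 0, 0, 0):

#[cfg(test)]
mod tests {
    use super::*;

    // Expected output taken from the quarterround example in the
    // Salsa20 specification.
    #[test]
    fn quarter_round_matches_spec_example() {
        let mut state = [0u32; STATE_WORDS];
        state[0] = 0x0000_0001;
        quarter_round(0, 1, 2, 3, &mut state);
        assert_eq!(
            state[..4],
            [0x0800_8145, 0x0000_0080, 0x0001_0200, 0x2050_0000]
        );
    }
}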
166 changes: 166 additions & 0 deletions salsa20/src/backends/sse2.rs
@@ -0,0 +1,166 @@
use crate::{
    backends::soft::Backend as SoftBackend,
    Block, SalsaCore, StreamClosure, Unsigned, STATE_WORDS,
};
use cipher::{
    consts::{U1, U64},
    BlockSizeUser, ParBlocksSizeUser, StreamBackend,
};
use core::marker::PhantomData;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[inline]
#[target_feature(enable = "sse2")]
pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Unsigned,
    F: StreamClosure<BlockSize = U64>,
{
    let state_ptr = state.as_ptr() as *const __m128i;
    let mut backend = Backend::<R> {
        v: [
            _mm_loadu_si128(state_ptr.add(0)),
            _mm_loadu_si128(state_ptr.add(1)),
            _mm_loadu_si128(state_ptr.add(2)),
            _mm_loadu_si128(state_ptr.add(3)),
        ],
        _pd: PhantomData,
    };

    // The SSE2 backend only works for Salsa20/20 (R = 10 double rounds).
    // Any other variant falls back to the soft backend.
    if R::USIZE == 10 {
        f.call(&mut backend);
        // Write the advanced block counter back into the caller's state.
        state[8] = _mm_cvtsi128_si32(backend.v[2]) as u32;
    } else {
        // Run the soft backend on a copy of the state, then store the copy
        // back so the advanced block counter is not lost between calls.
        let mut core = SalsaCore::<R> {
            state: *state,
            rounds: PhantomData,
        };
        f.call(&mut SoftBackend(&mut core));
        *state = core.state;
    }
}

struct Backend<R: Unsigned> {
    v: [__m128i; 4],
    _pd: PhantomData<R>,
}

impl<R: Unsigned> BlockSizeUser for Backend<R> {
    type BlockSize = U64;
}

impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
    type ParBlocksSize = U1;
}

impl<R: Unsigned> StreamBackend for Backend<R> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        unsafe {
            let res = rounds::<R>(&self.v);

            // Increment the block counter held in the low lane of v[2].
            self.v[2] = _mm_add_epi32(self.v[2], _mm_set_epi32(0, 0, 0, 1));
            let block_ptr = block.as_mut_ptr() as *mut __m128i;

            for (i, v) in res.iter().enumerate() {
                _mm_storeu_si128(block_ptr.add(i), *v);
            }
        }
    }
}

#[inline]
#[target_feature(enable = "sse2")]
unsafe fn rounds<R: Unsigned>(v: &[__m128i; 4]) -> [__m128i; 4] {
    let mut res = *v;

    for _ in 0..R::USIZE {
        double_round(&mut res);
    }

    // Feed-forward: add the input state to the round output.
    for i in 0..4 {
        res[i] = _mm_add_epi32(res[i], v[i]);
    }

    // The vectorized doubleround leaves the words in a permuted layout;
    // a transpose, a per-row rotate, and a second transpose map the
    // result back into canonical state order.
    transpose(&mut res);
    res[1] = _mm_shuffle_epi32(res[1], 0b_10_01_00_11);
    res[2] = _mm_shuffle_epi32(res[2], 0b_01_00_11_10);
    res[3] = _mm_shuffle_epi32(res[3], 0b_00_11_10_01);
    transpose(&mut res);

    res
}

/// The Salsa20 doubleround function for SSE2.
///
/// <https://users.rust-lang.org/t/can-the-compiler-infer-sse-instructions/59976>
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn double_round([a, b, c, d]: &mut [__m128i; 4]) {
    let mut t_sum: __m128i;
    let mut t_rotl: __m128i;

    // Operate on "columns".
    t_sum = _mm_add_epi32(*a, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *b = _mm_xor_si128(*b, t_rotl);

    t_sum = _mm_add_epi32(*b, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    t_sum = _mm_add_epi32(*c, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *d = _mm_xor_si128(*d, t_rotl);

    t_sum = _mm_add_epi32(*d, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data.
    *b = _mm_shuffle_epi32(*b, 0b_10_01_00_11);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_00_11_10_01);

    // Operate on "rows".
    t_sum = _mm_add_epi32(*a, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *d = _mm_xor_si128(*d, t_rotl);

    t_sum = _mm_add_epi32(*d, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    t_sum = _mm_add_epi32(*c, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *b = _mm_xor_si128(*b, t_rotl);

    t_sum = _mm_add_epi32(*b, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data.
    *b = _mm_shuffle_epi32(*b, 0b_00_11_10_01);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_10_01_00_11);
}
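SSE2 has no 32-bit rotate instruction, so every `rotate_left(n)` from the portable backend is emulated above as shift-left n XOR shift-right (32 - n). A scalar sketch of that identity (illustrative only, not part of the diff):

// rotl(x, n) == (x << n) ^ (x >> (32 - n)) for 0 < n < 32: the two
// shifted halves occupy disjoint bit positions, so XOR behaves as OR.
fn rotl_via_shifts(x: u32, n: u32) -> u32 {
    (x << n) ^ (x >> (32 - n))
}

#[test]
fn rotl_identity_holds_for_salsa_constants() {
    for &n in &[7u32, 9, 13, 18] {
        assert_eq!(rotl_via_shifts(0xdead_beef, n), 0xdead_beef_u32.rotate_left(n));
    }
}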

/// Transpose an integer 4 by 4 matrix in SSE2.
///
/// <https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html>
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn transpose([a, b, c, d]: &mut [__m128i; 4]) {
    let t0 = _mm_unpacklo_epi32(*a, *b);
    let t1 = _mm_unpacklo_epi32(*c, *d);
    let t2 = _mm_unpackhi_epi32(*a, *b);
    let t3 = _mm_unpackhi_epi32(*c, *d);

    *a = _mm_unpacklo_epi64(t0, t1);
    *b = _mm_unpackhi_epi64(t0, t1);
    *c = _mm_unpacklo_epi64(t2, t3);
    *d = _mm_unpackhi_epi64(t2, t3);
}
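Because `transpose` is a plain 4-by-4 word transpose, applying it twice must return the input. A hypothetical test sketch (not part of this diff) pinning down the unpacklo/unpackhi wiring; it assumes an SSE2-capable host, which is guaranteed on x86_64:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn transpose_is_an_involution() {
        let words: [u32; 16] = core::array::from_fn(|i| i as u32);
        unsafe {
            let p = words.as_ptr() as *const __m128i;
            let mut m = [
                _mm_loadu_si128(p.add(0)),
                _mm_loadu_si128(p.add(1)),
                _mm_loadu_si128(p.add(2)),
                _mm_loadu_si128(p.add(3)),
            ];
            transpose(&mut m);
            transpose(&mut m);

            let mut out = [0u32; 16];
            let q = out.as_mut_ptr() as *mut __m128i;
            for (i, v) in m.iter().enumerate() {
                _mm_storeu_si128(q.add(i), *v);
            }
            assert_eq!(out, words);
        }
    }
}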