diff --git a/Cargo.lock b/Cargo.lock index a4b5a204..df6d98b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,7 @@ dependencies = [ "cpufeatures", "hex-literal", "password-hash", + "rayon", "zeroize", ] diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index 3db8bd91..625a8a58 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -21,6 +21,7 @@ base64ct = "1" blake2 = { version = "=0.11.0-pre.5", default-features = false } # optional dependencies +rayon = { version = "1.7", optional = true } password-hash = { version = "0.6.0-rc.0", optional = true } zeroize = { version = "1", default-features = false, optional = true } @@ -36,6 +37,7 @@ default = ["alloc", "password-hash", "rand"] alloc = ["password-hash?/alloc"] std = ["alloc", "password-hash?/os_rng", "base64ct/std"] +parallel = ["dep:rayon", "std"] rand = ["password-hash?/rand_core"] simple = ["password-hash"] zeroize = ["dep:zeroize"] diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs index 67e30351..a5c2ac10 100644 --- a/argon2/src/lib.rs +++ b/argon2/src/lib.rs @@ -153,6 +153,7 @@ mod algorithm; mod blake2b_long; mod block; mod error; +mod memory; mod params; mod version; @@ -174,6 +175,7 @@ pub use { use crate::blake2b_long::blake2b_long; use blake2::{Blake2b512, Digest, digest}; use core::fmt; +use memory::Memory; #[cfg(all(feature = "alloc", feature = "password-hash"))] use password_hash::{Decimal, Ident, ParamsString, Salt}; @@ -349,7 +351,7 @@ impl<'key> Argon2<'key> { mut initial_hash: digest::Output, ) -> Result<()> { let block_count = self.params.block_count(); - let memory_blocks = memory_blocks + let mut memory_blocks = memory_blocks .get_mut(..block_count) .ok_or(Error::MemoryTooLittle)?; @@ -383,31 +385,59 @@ impl<'key> Argon2<'key> { // Run passes on blocks for pass in 0..iterations { - for slice in 0..SYNC_POINTS { + memory_blocks.for_each_segment(lanes, |mut memory_view, slice, lane| { let data_independent_addressing = self.algorithm == Algorithm::Argon2i || (self.algorithm == 
Algorithm::Argon2id && pass == 0 && slice < SYNC_POINTS / 2); - for lane in 0..lanes { - let mut address_block = Block::default(); - let mut input_block = Block::default(); - let zero_block = Block::default(); + let mut address_block = Block::default(); + let mut input_block = Block::default(); + let zero_block = Block::default(); + + if data_independent_addressing { + input_block.as_mut()[..6].copy_from_slice(&[ + pass as u64, + lane as u64, + slice as u64, + block_count as u64, + iterations as u64, + self.algorithm as u64, + ]); + } + let first_block = if pass == 0 && slice == 0 { if data_independent_addressing { - input_block.as_mut()[..6].copy_from_slice(&[ - pass as u64, - lane as u64, - slice as u64, - memory_blocks.len() as u64, - iterations as u64, - self.algorithm as u64, - ]); + // Generate first set of addresses + self.update_address_block( + &mut address_block, + &mut input_block, + &zero_block, + ); } - let first_block = if pass == 0 && slice == 0 { - if data_independent_addressing { - // Generate first set of addresses + // The first two blocks of each lane are already initialized + 2 + } else { + 0 + }; + + let mut cur_index = lane * lane_length + slice * segment_length + first_block; + let mut prev_index = if slice == 0 && first_block == 0 { + // Last block in current lane + cur_index + lane_length - 1 + } else { + // Previous block + cur_index - 1 + }; + + // Fill blocks in the segment + for block in first_block..segment_length { + // Extract entropy + let rand = if data_independent_addressing { + let addres_index = block % ADDRESSES_IN_BLOCK; + + if addres_index == 0 { self.update_address_block( &mut address_block, &mut input_block, @@ -415,101 +445,73 @@ impl<'key> Argon2<'key> { ); } - // The first two blocks of each lane are already initialized - 2 + address_block.as_ref()[addres_index] } else { - 0 + memory_view.get_block(prev_index).as_ref()[0] }; - let mut cur_index = lane * lane_length + slice * segment_length + first_block; - let mut 
prev_index = if slice == 0 && first_block == 0 { - // Last block in current lane - cur_index + lane_length - 1 + // Calculate source block index for compress function + let ref_lane = if pass == 0 && slice == 0 { + // Cannot reference other lanes yet + lane } else { - // Previous block - cur_index - 1 + (rand >> 32) as usize % lanes }; - // Fill blocks in the segment - for block in first_block..segment_length { - // Extract entropy - let rand = if data_independent_addressing { - let addres_index = block % ADDRESSES_IN_BLOCK; - - if addres_index == 0 { - self.update_address_block( - &mut address_block, - &mut input_block, - &zero_block, - ); - } - - address_block.as_ref()[addres_index] - } else { - memory_blocks[prev_index].as_ref()[0] - }; - - // Calculate source block index for compress function - let ref_lane = if pass == 0 && slice == 0 { - // Cannot reference other lanes yet - lane - } else { - (rand >> 32) as usize % lanes - }; - - let reference_area_size = if pass == 0 { - // First pass - if slice == 0 { - // First slice - block - 1 // all but the previous - } else if ref_lane == lane { - // The same lane => add current segment - slice * segment_length + block - 1 - } else { - slice * segment_length - if block == 0 { 1 } else { 0 } - } + let reference_area_size = if pass == 0 { + // First pass + if slice == 0 { + // First slice + block - 1 // all but the previous + } else if ref_lane == lane { + // The same lane => add current segment + slice * segment_length + block - 1 } else { - // Second pass - if ref_lane == lane { - lane_length - segment_length + block - 1 - } else { - lane_length - segment_length - if block == 0 { 1 } else { 0 } - } - }; - - // 1.2.4. Mapping rand to 0.. 
and produce - // relative position - let mut map = rand & 0xFFFFFFFF; - map = (map * map) >> 32; - let relative_position = reference_area_size - - 1 - - ((reference_area_size as u64 * map) >> 32) as usize; - - // 1.2.5 Computing starting position - let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { - (slice + 1) * segment_length + slice * segment_length - if block == 0 { 1 } else { 0 } + } + } else { + // Second pass + if ref_lane == lane { + lane_length - segment_length + block - 1 } else { - 0 - }; + lane_length - segment_length - if block == 0 { 1 } else { 0 } + } + }; - let lane_index = (start_position + relative_position) % lane_length; - let ref_index = ref_lane * lane_length + lane_index; + // 1.2.4. Mapping rand to 0.. and produce + // relative position + let mut map = rand & 0xFFFFFFFF; + map = (map * map) >> 32; + let relative_position = reference_area_size + - 1 + - ((reference_area_size as u64 * map) >> 32) as usize; + + // 1.2.5 Computing starting position + let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { + (slice + 1) * segment_length + } else { + 0 + }; - // Calculate new block - let result = - self.compress(&memory_blocks[prev_index], &memory_blocks[ref_index]); + let lane_index = (start_position + relative_position) % lane_length; + let ref_index = ref_lane * lane_length + lane_index; - if self.version == Version::V0x10 || pass == 0 { - memory_blocks[cur_index] = result; - } else { - memory_blocks[cur_index] ^= &result; - }; + // Calculate new block + let result = self.compress( + memory_view.get_block(prev_index), + memory_view.get_block(ref_index), + ); - prev_index = cur_index; - cur_index += 1; - } + if self.version == Version::V0x10 || pass == 0 { + *memory_view.get_block_mut(cur_index) = result; + } else { + *memory_view.get_block_mut(cur_index) ^= &result; + }; + + prev_index = cur_index; + cur_index += 1; } - } + }); } Ok(()) diff --git a/argon2/src/memory.rs b/argon2/src/memory.rs new file mode 100644 index 
00000000..3802d9c0 --- /dev/null +++ b/argon2/src/memory.rs @@ -0,0 +1,188 @@
+//! Views into Argon2 memory that can be processed in parallel.
+//!
+//! This module implements, with a combination of compile-time borrowing and runtime checking, the
+//! cooperative contract described in section 3.4 (Indexing) of RFC 9106:
+//!
+//! > To enable parallel block computation, we further partition the memory matrix into SL = 4
+//! > vertical slices. The intersection of a slice and a lane is called a segment, which has a
+//! > length of q/SL. Segments of the same slice can be computed in parallel and do not reference
+//! > blocks from each other. All other blocks can be referenced.
+
+#![warn(
+    clippy::undocumented_unsafe_blocks,
+    clippy::missing_safety_doc,
+    unsafe_op_in_unsafe_fn
+)]
+
+use core::marker::PhantomData;
+use core::ptr::NonNull;
+
+#[cfg(feature = "parallel")]
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+use crate::{Block, SYNC_POINTS};
+
+/// Extension trait for Argon2 memory blocks.
+pub(crate) trait Memory<'a> {
+    /// Compute each Argon2 segment.
+    ///
+    /// Calls `f(view, slice, lane)` once for every slice in `0..SYNC_POINTS` and every lane in
+    /// `0..lanes`. Slices are always processed one after another; only the lanes within a slice
+    /// may run concurrently.
+    ///
+    /// By default computation is single threaded. Parallel computation can be enabled with the
+    /// `parallel` feature, in which case [rayon] is used to compute as many lanes in parallel as
+    /// possible.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lanes` is zero.
+    fn for_each_segment<F>(&mut self, lanes: usize, f: F)
+    where
+        F: Fn(SegmentView<'_>, usize, usize) + Sync + Send;
+}
+
+impl Memory<'_> for &mut [Block] {
+    #[cfg(not(feature = "parallel"))]
+    fn for_each_segment<F>(&mut self, lanes: usize, f: F)
+    where
+        F: Fn(SegmentView<'_>, usize, usize) + Sync + Send,
+    {
+        let inner = MemoryInner::new(self, lanes);
+        for slice in 0..SYNC_POINTS {
+            for lane in 0..lanes {
+                // SAFETY: `self` exclusively borrows the blocks, and we sequentially process
+                // slices and segments.
+                let segment = unsafe { SegmentView::new(inner, slice, lane) };
+                f(segment, slice, lane);
+            }
+        }
+    }
+
+    #[cfg(feature = "parallel")]
+    fn for_each_segment<F>(&mut self, lanes: usize, f: F)
+    where
+        F: Fn(SegmentView<'_>, usize, usize) + Sync + Send,
+    {
+        let inner = MemoryInner::new(self, lanes);
+        for slice in 0..SYNC_POINTS {
+            // Every lane of the current slice is finished before the next slice starts.
+            (0..lanes).into_par_iter().for_each(|lane| {
+                // SAFETY: `self` exclusively borrows the blocks, we sequentially process slices,
+                // and we create exactly one segment view per lane in a slice.
+                let segment = unsafe { SegmentView::new(inner, slice, lane) };
+                f(segment, slice, lane);
+            });
+        }
+    }
+}
+
+/// Low-level pointer and metadata for an Argon2 memory region.
+///
+/// The lifetime `'a` is that of the exclusive borrow of the blocks the pointer was derived from.
+#[derive(Clone, Copy)]
+struct MemoryInner<'a> {
+    blocks: NonNull<Block>,
+    block_count: usize,
+    lane_length: usize,
+    phantom: PhantomData<&'a mut Block>,
+}
+
+impl<'a> MemoryInner<'a> {
+    /// Capture a raw pointer to `memory_blocks` together with its lane geometry.
+    ///
+    /// The borrow is tied to `'a` so that neither this value nor any view built from it can
+    /// outlive the exclusive borrow of the blocks.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lanes` is zero.
+    fn new(memory_blocks: &'a mut [Block], lanes: usize) -> Self {
+        let block_count = memory_blocks.len();
+        let lane_length = block_count / lanes;
+
+        // The pointer needs to be derived from a mutable reference because (later) mutating the
+        // blocks through a pointer derived from a shared reference would be UB.
+        let blocks = NonNull::from(memory_blocks);
+
+        MemoryInner {
+            blocks: blocks.cast(),
+            block_count,
+            lane_length,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Lane that the block at `index` belongs to (lanes are stored back to back).
+    fn lane_of(&self, index: usize) -> usize {
+        index / self.lane_length
+    }
+
+    /// Slice that the block at `index` belongs to, i.e. the segment number within its lane.
+    fn slice_of(&self, index: usize) -> usize {
+        index / (self.lane_length / SYNC_POINTS) % SYNC_POINTS
+    }
+}
+
+// SAFETY: `MemoryInner` is only a pointer plus plain integers and never dereferences the pointer
+// itself. Every access goes through `SegmentView`, whose `unsafe` constructor and asserting
+// accessors keep concurrent views from aliasing a block that is being written.
+// NOTE(review): this also relies on `Block` itself being `Send` — confirm.
+unsafe impl Send for MemoryInner<'_> {}
+
+// SAFETY: see the `Send` impl above; sharing the pointer and its metadata between threads grants
+// no access to the blocks without a `SegmentView`.
+// NOTE(review): this also relies on `Block` itself being `Sync` — confirm.
+unsafe impl Sync for MemoryInner<'_> {}
+
+/// A view into Argon2 memory for a particular segment (i.e. slice × lane).
+pub(crate) struct SegmentView<'a> {
+    inner: MemoryInner<'a>,
+    slice: usize,
+    lane: usize,
+}
+
+impl<'a> SegmentView<'a> {
+    /// Create a view into Argon2 memory for a particular segment (i.e. slice × lane).
+    ///
+    /// # Safety
+    ///
+    /// At any time, there can be at most one view for a given Argon2 segment. Additionally, all
+    /// concurrent segment views must be for the same slice.
+    unsafe fn new(inner: MemoryInner<'a>, slice: usize, lane: usize) -> Self {
+        SegmentView { inner, slice, lane }
+    }
+
+    /// Get a shared reference to a block.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index is out of bounds or if the desired block *could* be mutably aliased (if
+    /// it is on the current slice but on a different lane/segment).
+    pub fn get_block(&self, index: usize) -> &Block {
+        assert!(index < self.inner.block_count);
+        assert!(self.inner.lane_of(index) == self.lane || self.inner.slice_of(index) != self.slice);
+
+        // SAFETY: by construction, the base pointer is valid for reads, and we assert that the
+        // index is in bounds. We also assert that the index either lies on this lane, or is on
+        // another slice. Finally, we're the only view into this segment, and mutating through it
+        // requires `&mut self` and is restricted to blocks within the segment.
+        unsafe { self.inner.blocks.add(index).as_ref() }
+    }
+
+    /// Get a mutable reference to a block.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index is out of bounds or if the desired block lies outside this segment.
+    pub fn get_block_mut(&mut self, index: usize) -> &mut Block {
+        assert!(index < self.inner.block_count);
+        assert_eq!(self.inner.lane_of(index), self.lane);
+        assert_eq!(self.inner.slice_of(index), self.slice);
+
+        // SAFETY: by construction, the base pointer is valid for reads and writes, and we assert
+        // that the index is in bounds. We also assert that the index lies on this segment, and
+        // we're the only view for it, taking `&mut self`.
+        unsafe { self.inner.blocks.add(index).as_mut() }
+    }
+}
diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 6109e6af..491ce0ea 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -8,10 +8,17 @@ publish = false [dev-dependencies] argon2 = { path = "../argon2" } -criterion = { version = "0.4", features = ["html_reports"] } +criterion = { version = "0.5", features = ["html_reports"] } pprof = { version = "0.14", features = ["flamegraph", "criterion"] } +[features] +default = [] +parallel = ["argon2/parallel"] + [[bench]] name = "argon2" path = "src/argon2.rs" harness = false + +[patch.crates-io] +password-hash = { git = "https://github.com/RustCrypto/traits.git" } diff --git a/benches/src/argon2.rs b/benches/src/argon2.rs index b26b6371..b939c5d1 100644 --- a/benches/src/argon2.rs +++ b/benches/src/argon2.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeSet; + use argon2::*; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use pprof::criterion::{Output, PProfProfiler}; @@ -26,46 +28,26 @@ fn bench_default_params(c: &mut Criterion) { } } -fn bench_vary_m(c: &mut Criterion) { - let t_cost = 4; - let p_cost = 4; - for m_cost in [2 * 1024, 16 * 1024, 64 * 1024, 256 * 1024] { - let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); - c.bench_function(&test_name, |b| { - let mut out = [0u8; 32]; - let params = Params::new(m_cost, t_cost, p_cost, Some(32)).unwrap(); - let argon2 = Argon2::new(Algorithm::Argon2id, Version::V0x13, params); - b.iter(|| { - argon2 - .hash_password_into(black_box(BENCH_PASSWORD), black_box(BENCH_SALT), &mut out) - .unwrap() - }) - }); +fn bench_vary_params(c: &mut Criterion) { + let mut tests = BTreeSet::new(); + // Vary `m_cost`.
+ for m_cost in [2 * 1024, 16 * 1024, 32 * 1024, 64 * 1024, 256 * 1024] { + tests.insert((m_cost, 4, 4)); } -} - -fn bench_vary_t(c: &mut Criterion) { - let m_cost = 32 * 1024; - let p_cost = 4; - for t_cost in [2, 8, 16, 24] { - let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); - c.bench_function(&test_name, |b| { - let mut out = [0u8; 32]; - let params = Params::new(m_cost, t_cost, p_cost, Some(32)).unwrap(); - let argon2 = Argon2::new(Algorithm::Argon2id, Version::V0x13, params); - b.iter(|| { - argon2 - .hash_password_into(black_box(BENCH_PASSWORD), black_box(BENCH_SALT), &mut out) - .unwrap() - }) - }); + // Vary `t_cost`. + for t_cost in [1, 2, 4, 8, 16] { + tests.insert((32 * 1024, t_cost, 4)); } -} - -fn bench_vary_p(c: &mut Criterion) { - let m_cost = 32 * 1024; - let t_cost = 4; - for p_cost in [2, 8, 16, 64] { + // Vary `p_cost`. + for p_cost in [1, 2, 4, 8, 16] { + for m_mib in [256 * 1024, 1024 * 1024] { + tests.insert((m_mib, 1, p_cost)); + } + for t_cost in [1, 2, 4] { + tests.insert((32 * 1024, t_cost, p_cost)); + } + } + for (m_cost, t_cost, p_cost) in tests { let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); c.bench_function(&test_name, |b| { let mut out = [0u8; 32]; @@ -85,8 +67,6 @@ criterion_group!( config = Criterion::default().with_profiler(PProfProfiler::new(300, Output::Flamegraph(None))); targets = bench_default_params, - bench_vary_m, - bench_vary_t, - bench_vary_p, + bench_vary_params, ); criterion_main!(benches);