diff --git a/salsa20/benches/mod.rs b/salsa20/benches/mod.rs index f07b023a..a680c449 100644 --- a/salsa20/benches/mod.rs +++ b/salsa20/benches/mod.rs @@ -1,9 +1,15 @@ #![feature(test)] + +use cipher::{ + Array, + consts::{U4, U64}, +}; extern crate test; cipher::stream_cipher_bench!( salsa20::Salsa8; salsa8_bench1_16b 16; + salsa8_bench1_64b 64; salsa8_bench2_256b 256; salsa8_bench3_1kib 1024; salsa8_bench4_16kib 16384; @@ -12,6 +18,7 @@ cipher::stream_cipher_bench!( cipher::stream_cipher_bench!( salsa20::Salsa12; salsa12_bench1_16b 16; + salsa12_bench1_64b 64; salsa12_bench2_256b 256; salsa12_bench3_1kib 1024; salsa12_bench4_16kib 16384; @@ -20,7 +27,50 @@ cipher::stream_cipher_bench!( cipher::stream_cipher_bench!( salsa20::Salsa20; salsa20_bench1_16b 16; + salsa20_bench1_64b 64; salsa20_bench2_256b 256; salsa20_bench3_1kib 1024; salsa20_bench4_16kib 16384; ); + +#[bench] +fn salsa8_bench1_chaining_altn(b: &mut test::Bencher) { + use salsa20::SalsaChaining; + use std::hash::{BuildHasher, Hasher}; + + let seed = std::hash::RandomState::new().build_hasher().finish(); + + let mut buf = [0u32; 16]; + buf[0] = seed as u32; + buf[1] = (seed >> 32) as u32; + + b.iter(|| { + let mut cipher = salsa20::SalsaCore::::from_raw_state_cv(buf); + cipher.write_keystream_block_cv(&mut buf); + test::black_box(&buf); + }); + + b.bytes = buf.len() as u64 * core::mem::size_of::() as u64; +} + +#[bench] +fn salsa8_bench1_chaining(b: &mut test::Bencher) { + use cipher::StreamCipherCore; + use std::hash::{BuildHasher, Hasher}; + + let seed = std::hash::RandomState::new().build_hasher().finish(); + + let mut buf = [0u32; 16]; + buf[0] = seed as u32; + buf[1] = (seed >> 32) as u32; + + b.iter(|| { + let mut cipher = salsa20::SalsaCore::::from_raw_state(buf); + cipher.write_keystream_block(unsafe { + core::mem::transmute::<&mut [u32; 16], &mut Array>(&mut buf) + }); + test::black_box(&buf); + }); + + b.bytes = buf.len() as u64 * core::mem::size_of::() as u64; +} diff --git a/salsa20/src/backends.rs b/salsa20/src/backends.rs index fbc9393c..4623d655 100644 --- a/salsa20/src/backends.rs +++ b/salsa20/src/backends.rs @@ -1 +1,30 @@ -pub(crate) mod soft; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(all(target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))] { + pub(crate) mod sse2; + pub(crate) type Backend<'a, R> = sse2::Backend<'a, R>; + } else { + pub(crate) mod soft; + pub(crate) type Backend<'a, R> = soft::Backend<'a, R>; + } +} + +#[inline] +#[allow(clippy::many_single_char_names)] +pub(crate) fn quarter_round( + a: usize, + b: usize, + c: usize, + d: usize, + state: &mut [u32; crate::STATE_WORDS], +) { + let a = crate::DATA_LAYOUT_INVERSE[a]; + let b = crate::DATA_LAYOUT_INVERSE[b]; + let c = crate::DATA_LAYOUT_INVERSE[c]; + let d = crate::DATA_LAYOUT_INVERSE[d]; + state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); + state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); + state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); + state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); +} diff --git a/salsa20/src/backends/soft.rs b/salsa20/src/backends/soft.rs index caf2693f..7207020f 100644 --- a/salsa20/src/backends/soft.rs +++ b/salsa20/src/backends/soft.rs @@ -7,8 +7,16 @@ use cipher::{ consts::{U1, U64}, }; +use super::quarter_round; + pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore); +impl<'a, R: Unsigned> From<&'a mut SalsaCore> for Backend<'a, R> { + fn from(core: &'a mut SalsaCore) -> Self { + Backend(core) + } +} + impl BlockSizeUser for Backend<'_, R> { type BlockSize = U64; } @@ -17,6 +25,17 @@ impl ParBlocksSizeUser for Backend<'_, R> { type ParBlocksSize = U1; } +impl Backend<'_, R> { + #[inline(always)] + pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) { + let res = run_rounds::(&self.0.state); + + self.0.set_block_pos(self.0.get_block_pos() + 1); + + block.copy_from_slice(&res); + } +} + impl StreamCipherBackend for Backend<'_, R> { #[inline(always)] fn gen_ks_block(&mut self, block: &mut Block) { @@ -24,27 +43,13 @@ impl StreamCipherBackend for Backend<'_, R> { self.0.set_block_pos(self.0.get_block_pos() + 1); - for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) { - chunk.copy_from_slice(&val.to_le_bytes()); + for i in 0..16 { + block[i * 4..(i + 1) * 4] + .copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes()); } } } -#[inline] -#[allow(clippy::many_single_char_names)] -pub(crate) fn quarter_round( - a: usize, - b: usize, - c: usize, - d: usize, - state: &mut [u32; STATE_WORDS], -) { - state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); - state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); - state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); - state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); -} - #[inline(always)] fn run_rounds(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] { let mut res = *state; diff --git a/salsa20/src/backends/sse2.rs b/salsa20/src/backends/sse2.rs new file mode 100644 index 00000000..0a682a51 --- /dev/null +++ b/salsa20/src/backends/sse2.rs @@ -0,0 +1,112 @@ +//! SSE2 backend for Salsa20. + +use crate::{Block, STATE_WORDS, SalsaCore, Unsigned}; +use cipher::{ + Array, BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherSeekCore, + consts::{U1, U64}, +}; + +pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore); + +impl<'a, R: Unsigned> From<&'a mut SalsaCore> for Backend<'a, R> { + fn from(core: &'a mut SalsaCore) -> Self { + Backend(core) + } +} + +impl BlockSizeUser for Backend<'_, R> { + type BlockSize = U64; +} + +impl ParBlocksSizeUser for Backend<'_, R> { + type ParBlocksSize = U1; +} + +impl Backend<'_, R> { + #[inline(always)] + pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) { + unsafe { run_rounds_sse2_ptr::(block.as_mut_ptr().cast(), &self.0.state) }; + + self.0.set_block_pos(self.0.get_block_pos() + 1); + } +} + +impl StreamCipherBackend for Backend<'_, R> { + #[inline(always)] + fn gen_ks_block(&mut self, block: &mut Block) { + let mut res = [0u32; STATE_WORDS]; + unsafe { run_rounds_sse2_ptr::(res.as_mut_ptr().cast(), &self.0.state) }; + + self.0.set_block_pos(self.0.get_block_pos() + 1); + + for i in 0..16 { + block[i * 4..(i + 1) * 4] + .copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes()); + } + } +} + +#[inline(always)] +/// Run the Salsa20 rounds using SSE2 instructions. +/// +/// Input: state in internal order +/// Output: out in internal order, does not have to be aligned on any boundary +unsafe fn run_rounds_sse2_ptr(out: *mut Array, state: &[u32; STATE_WORDS]) { + use core::arch::x86_64::*; + unsafe { + let [a_save, b_save, d_save, c_save] = [ + _mm_loadu_si128(state.as_ptr().add(0).cast()), + _mm_loadu_si128(state.as_ptr().add(4).cast()), + _mm_loadu_si128(state.as_ptr().add(8).cast()), + _mm_loadu_si128(state.as_ptr().add(12).cast()), + ]; + let [mut a, mut b, mut c, mut d] = [a_save, b_save, c_save, d_save]; + + macro_rules! mm_rol_epi32x { + ($w:expr, $amt:literal) => {{ + let w = $w; + _mm_xor_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt)) + }}; + } + + macro_rules! quarter_xmmwords { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + $b = _mm_xor_si128($b, mm_rol_epi32x!(_mm_add_epi32($a, $d), 7)); + $c = _mm_xor_si128($c, mm_rol_epi32x!(_mm_add_epi32($b, $a), 9)); + $d = _mm_xor_si128($d, mm_rol_epi32x!(_mm_add_epi32($c, $b), 13)); + $a = _mm_xor_si128($a, mm_rol_epi32x!(_mm_add_epi32($d, $c), 18)); + }; + } + + for _ in 0..R::USIZE { + quarter_xmmwords!(a, b, c, d); + + // a stays in place + // b = left shuffle d by 1 element + d = _mm_shuffle_epi32(d, 0b00_11_10_01); + // c = left shuffle c by 2 elements + c = _mm_shuffle_epi32(c, 0b01_00_11_10); + // d = left shuffle b by 3 elements + b = _mm_shuffle_epi32(b, 0b10_01_00_11); + + (b, d) = (d, b); + + quarter_xmmwords!(a, b, c, d); + + // a stays in place + // b = left shuffle d by 1 element + d = _mm_shuffle_epi32(d, 0b00_11_10_01); + // c = left shuffle c by 2 elements + c = _mm_shuffle_epi32(c, 0b01_00_11_10); + // d = left shuffle b by 3 elements + b = _mm_shuffle_epi32(b, 0b10_01_00_11); + + (b, d) = (d, b); + } + + _mm_storeu_si128(out.byte_add(0).cast(), _mm_add_epi32(a, a_save)); + _mm_storeu_si128(out.byte_add(16).cast(), _mm_add_epi32(b, b_save)); + _mm_storeu_si128(out.byte_add(32).cast(), _mm_add_epi32(d, d_save)); + _mm_storeu_si128(out.byte_add(48).cast(), _mm_add_epi32(c, c_save)); + } +} diff --git a/salsa20/src/lib.rs b/salsa20/src/lib.rs index a5407c73..9e0469ba 100644 --- a/salsa20/src/lib.rs +++ b/salsa20/src/lib.rs @@ -118,7 +118,27 @@ const STATE_WORDS: usize = 16; /// State initialization constant ("expand 32-byte k") const CONSTANTS: [u32; 4] = [0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]; +const DATA_LAYOUT: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 12, 1, 6, 11, 8, 13, 2, 7]; + +const DATA_LAYOUT_INVERSE: [usize; 16] = { + let mut index = [0; 16]; + let mut i = 0; + while i < 16 { + let mut inverse = 0; + while inverse < 16 { + if DATA_LAYOUT[inverse] == i { + index[i] = inverse; + break; + } + inverse += 1; + } + i += 1; + } + index +}; + /// The Salsa20 core function. +#[repr(transparent)] pub struct SalsaCore { /// Internal state of the core function state: [u32; STATE_WORDS], @@ -126,17 +146,91 @@ pub struct SalsaCore { rounds: PhantomData, } +#[expect(unused)] +const STATIC_ASSERT_CORE_IS_64_BYTES: [(); size_of::>()] = [(); 64]; + +/// Salsa20 chaining operations. +pub trait SalsaChaining: BlockSizeUser { + /// Permutation table for shuffling the natural order state into the internal order. + const ALTN_DATA_LAYOUT: [usize; STATE_WORDS]; + + /// Inverse permutation table. + const INVERSE_ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = { + let mut index = [0; 16]; + let mut i = 0; + while i < 16 { + let mut inverse = 0; + while inverse < 16 { + if Self::ALTN_DATA_LAYOUT[inverse] == i { + index[i] = inverse; + break; + } + inverse += 1; + } + i += 1; + } + index + }; + + /// Shuffle the state into the internal data layout. + fn shuffle_state_into_altn(state: &mut [u32; STATE_WORDS]) { + let mut new_state = [0u32; STATE_WORDS]; + for i in 0..STATE_WORDS { + new_state[i] = state[Self::ALTN_DATA_LAYOUT[i]]; + } + state.copy_from_slice(&new_state); + } + + /// Shuffle the state from the internal data layout. + fn shuffle_state_from_altn(state: &mut [u32; STATE_WORDS]) { + let mut new_state = [0u32; STATE_WORDS]; + for i in 0..STATE_WORDS { + new_state[i] = state[Self::INVERSE_ALTN_DATA_LAYOUT[i]]; + } + state.copy_from_slice(&new_state); + } + + /// Instantiate new Salsa core from raw state in internal order. + fn from_raw_state_cv(state: [u32; STATE_WORDS]) -> Self; + + /// Generate keystream block in internal order. + fn write_keystream_block_cv(&mut self, block: &mut [u32; STATE_WORDS]); +} + impl SalsaCore { /// Create new Salsa core from raw state. /// /// This method is mainly intended for the `scrypt` crate. /// Other users generally should not use this method. pub fn from_raw_state(state: [u32; STATE_WORDS]) -> Self { + Self { + state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]), + rounds: PhantomData, + } + } +} + +impl SalsaChaining for SalsaCore { + const ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = DATA_LAYOUT; + + /// Create new Salsa core from raw state with alternative data layout. + /// + /// This method is mainly intended for the `scrypt` crate. + /// Other users generally should not use this method. + fn from_raw_state_cv(state: [u32; STATE_WORDS]) -> Self { Self { state, rounds: PhantomData, } } + + /// Generate keystream block with alternative data layout. + /// + /// This method is used to generate keystream blocks with alternative data layout. + fn write_keystream_block_cv(&mut self, block: &mut [u32; STATE_WORDS]) { + let mut backend = backends::Backend::<'_, R>::from(self); + backend.gen_ks_block_altn(block); + } } impl KeySizeUser for SalsaCore { @@ -177,7 +271,7 @@ impl KeyIvInit for SalsaCore { state[15] = CONSTANTS[3]; Self { - state, + state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]), rounds: PhantomData, } } @@ -190,7 +284,7 @@ impl StreamCipherCore for SalsaCore { rem.try_into().ok() } fn process_with_backend(&mut self, f: impl StreamCipherClosure) { - f.call(&mut backends::soft::Backend(self)); + f.call(&mut backends::Backend::<'_, R>::from(self)); } } @@ -199,13 +293,14 @@ impl StreamCipherSeekCore for SalsaCore { #[inline(always)] fn get_block_pos(&self) -> u64 { - (self.state[8] as u64) + ((self.state[9] as u64) << 32) + (self.state[DATA_LAYOUT_INVERSE[8]] as u64) + + ((self.state[DATA_LAYOUT_INVERSE[9]] as u64) << 32) } #[inline(always)] fn set_block_pos(&mut self, pos: u64) { - self.state[8] = (pos & 0xffff_ffff) as u32; - self.state[9] = ((pos >> 32) & 0xffff_ffff) as u32; + self.state[DATA_LAYOUT_INVERSE[8]] = (pos & 0xffff_ffff) as u32; + self.state[DATA_LAYOUT_INVERSE[9]] = ((pos >> 32) & 0xffff_ffff) as u32; } } diff --git a/salsa20/src/xsalsa.rs b/salsa20/src/xsalsa.rs index 9f84cef5..aa30349b 100644 --- a/salsa20/src/xsalsa.rs +++ b/salsa20/src/xsalsa.rs @@ -8,7 +8,7 @@ use cipher::{ consts::{U4, U6, U10, U16, U24, U32, U64}, }; -use crate::backends::soft::quarter_round; +use crate::backends::quarter_round; #[cfg(feature = "zeroize")] use cipher::zeroize::ZeroizeOnDrop; @@ -90,28 +90,32 @@ impl ZeroizeOnDrop for XSalsaCore {} /// /// It produces 256-bits of output suitable for use as a Salsa20 key pub fn hsalsa(key: &Key, input: &Array) -> Array { + const KEY_IDX: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9]; + #[inline(always)] fn to_u32(chunk: &[u8]) -> u32 { u32::from_le_bytes(chunk.try_into().unwrap()) } - let mut state = [0u32; 16]; - state[0] = CONSTANTS[0]; - state[1..5] + let mut t = [0u32; 16]; + t[0] = CONSTANTS[0]; + t[1..5] .iter_mut() .zip(key[0..16].chunks_exact(4)) .for_each(|(v, chunk)| *v = to_u32(chunk)); - state[5] = CONSTANTS[1]; - state[6..10] + t[5] = CONSTANTS[1]; + t[6..10] .iter_mut() .zip(input.chunks_exact(4)) .for_each(|(v, chunk)| *v = to_u32(chunk)); - state[10] = CONSTANTS[2]; - state[11..15] + t[10] = CONSTANTS[2]; + t[11..15] .iter_mut() .zip(key[16..].chunks_exact(4)) .for_each(|(v, chunk)| *v = to_u32(chunk)); - state[15] = CONSTANTS[3]; + t[15] = CONSTANTS[3]; + + let mut state = core::array::from_fn(|i| t[crate::DATA_LAYOUT[i]]); // 20 rounds consisting of 10 column rounds and 10 diagonal rounds for _ in 0..R::USIZE { @@ -129,10 +133,10 @@ pub fn hsalsa(key: &Key, input: &Array) -> Array } let mut output = Array::default(); - let key_idx: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9]; - for (i, chunk) in output.chunks_exact_mut(4).enumerate() { - chunk.copy_from_slice(&state[key_idx[i]].to_le_bytes()); + for i in 0..8 { + output[i * 4..(i + 1) * 4] + .copy_from_slice(&state[crate::DATA_LAYOUT_INVERSE[KEY_IDX[i]]].to_le_bytes()); } output diff --git a/salsa20/tests/mod.rs b/salsa20/tests/mod.rs index a859d3dc..26e772f2 100644 --- a/salsa20/tests/mod.rs +++ b/salsa20/tests/mod.rs @@ -1,9 +1,11 @@ //! Salsa20 tests +use cipher::consts::U10; use cipher::{KeyIvInit, StreamCipher, StreamCipherSeek}; use hex_literal::hex; -use salsa20::Salsa20; +use salsa20::SalsaCore; use salsa20::XSalsa20; +use salsa20::{Salsa20, SalsaChaining}; cipher::stream_cipher_seek_test!(salsa20_seek, Salsa20); cipher::stream_cipher_seek_test!(xsalsa20_seek, XSalsa20); @@ -176,6 +178,26 @@ fn xsalsa20_encrypt_hello_world() { assert_eq!(buf, EXPECTED_XSALSA20_HELLO_WORLD); } +#[test] +fn salsa20_alternate_data_layout_shuffle() { + use cipher::StreamCipherCore; + let state: [u32; 16] = core::array::from_fn(|i| i as u32); + let mut altn_state = state; + SalsaCore::::shuffle_state_into_altn(&mut altn_state); + + let mut state_core = SalsaCore::::from_raw_state(state); + let mut altn_state_core = SalsaCore::::from_raw_state_cv(altn_state); + let mut b1 = Default::default(); + let mut b2 = Default::default(); + state_core.write_keystream_block(&mut b1); + altn_state_core.write_keystream_block(&mut b2); + assert_eq!(b1, b2); + + SalsaCore::::shuffle_state_from_altn(&mut altn_state); + + assert_eq!(state, altn_state); +} + // Regression test for https://github.com/RustCrypto/stream-ciphers/issues/445 #[test] fn salsa20_big_offset() {