From a0d7f3f45ec1e67813778dc2a73557eb74943a39 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Fri, 22 Mar 2024 01:47:37 +0900
Subject: [PATCH] x86_64: Use `lock or` instead of mfence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

10-35% faster at least in simple cases on Coffee Lake.

Below are the results of the microbenchmark on an Intel Core i7-9750H
(Coffee Lake) with the ORDERING constant set to SeqCst.

```
bench_portable_atomic_arch/u128_store
                        time:   [11.610 ns 11.670 ns 11.738 ns]
                        change: [-36.119% -35.236% -34.348%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
bench_portable_atomic_arch/u128_concurrent_load_store
                        time:   [202.30 µs 203.54 µs 205.24 µs]
                        change: [-32.313% -31.167% -29.845%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  2 (2.00%) high mild
  6 (6.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store
                        time:   [395.74 µs 397.37 µs 398.98 µs]
                        change: [-18.517% -17.560% -16.582%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
  12 (12.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store_swap
                        time:   [791.21 µs 793.43 µs 795.69 µs]
                        change: [-10.682% -10.197% -9.6789%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) low severe
  2 (2.00%) low mild
  2 (2.00%) high mild
  1 (1.00%) high severe
```
---
 bench/benches/bench.rs      | 23 +++++++++++++++++------
 src/imp/atomic128/x86_64.rs | 25 +++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/bench/benches/bench.rs b/bench/benches/bench.rs
index c5ed6b6f0..e8338e00c 100644
--- a/bench/benches/bench.rs
+++ b/bench/benches/bench.rs
@@ -51,6 +51,8 @@ mod spinlock_fallback;
 
 const THREADS: usize = 2;
 const N: u32 = 5000;
+const ORDERING: Ordering = Ordering::AcqRel;
+// const ORDERING: Ordering = Ordering::SeqCst;
 
 trait AtomicInt<T: Copy>: Sized + Send + Sync {
     fn new(v: T) -> Self;
@@ -69,24 +71,32 @@
             }
             #[inline]
             fn load_(&self) -> $int_type {
-                self.load(Ordering::Acquire)
+                self.load(if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING })
             }
             #[inline]
             fn store_(&self, val: $int_type) {
-                self.store(val, Ordering::Release);
+                self.store(
+                    val,
+                    if ORDERING == Ordering::AcqRel { Ordering::Release } else { ORDERING },
+                );
             }
             #[inline]
             fn swap_(&self, val: $int_type) -> $int_type {
-                self.swap(val, Ordering::AcqRel)
+                self.swap(val, ORDERING)
             }
             #[inline]
             fn compare_exchange_(&self, old: $int_type, new: $int_type) -> $int_type {
-                self.compare_exchange(old, new, Ordering::AcqRel, Ordering::Acquire)
-                    .unwrap_or_else(|x| x)
+                self.compare_exchange(
+                    old,
+                    new,
+                    ORDERING,
+                    if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING },
+                )
+                .unwrap_or_else(|x| x)
             }
             #[inline]
             fn fetch_add_(&self, val: $int_type) -> $int_type {
-                self.fetch_add(val, Ordering::AcqRel)
+                self.fetch_add(val, ORDERING)
             }
         }
     };
@@ -97,6 +107,7 @@ macro_rules! impl_atomic_no_order {
         impl AtomicInt<$int_type> for $atomic_type {
             #[inline]
             fn new(v: $int_type) -> Self {
+                assert_eq!(ORDERING, Ordering::AcqRel);
                 Self::new(v)
             }
             #[inline]
diff --git a/src/imp/atomic128/x86_64.rs b/src/imp/atomic128/x86_64.rs
index 8d06a814c..2d8454b62 100644
--- a/src/imp/atomic128/x86_64.rs
+++ b/src/imp/atomic128/x86_64.rs
@@ -68,6 +68,23 @@ macro_rules! ptr_modifier {
     };
 }
 
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "32")]
+macro_rules! sp {
+    () => {
+        "esp"
+    };
+}
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "64")]
+macro_rules! sp {
+    () => {
+        "rsp"
+    };
+}
+
 // Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
 // requirements for the currently enabled target features. In the first place,
 // there is no option in the x86 assembly for such case, like ARM .arch_extension,
@@ -185,10 +202,14 @@ unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
             Ordering::SeqCst => {
                 asm!(
                     concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
-                    "mfence",
+                    // Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake (https://github.com/taiki-e/portable-atomic/pull/156).
+                    // Based on x86 64-bit atomic SeqCst store using SSE generated by LLVM.
+                    // https://godbolt.org/z/9sKEr8YWc
+                    concat!("lock or dword ptr [", sp!(), "], 0"),
                     dst = in(reg) dst,
                     val = in(xmm_reg) val,
-                    options(nostack, preserves_flags),
+                    // Do not use `preserves_flags` because OR modifies the OF, CF, SF, ZF, and PF flags.
+                    options(nostack),
                 );
             }
             _ => unreachable!(),
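
Note (not part of the patch): for readers unfamiliar with the trick above, following a plain store (here `vmovdqa`) with a dummy locked read-modify-write such as `lock or dword ptr [rsp], 0` is a well-known alternative to `mfence` for making the store SeqCst on x86: any `lock`-prefixed instruction acts as a full memory barrier, and a no-op `or` on the stack pointer's cache line (which is very likely already owned by the core) tends to be cheaper than `mfence`. The sketch below is illustrative only; it uses plain `u64` stores so it compiles on stable Rust without this patch, the function names are hypothetical, and the asm mirrors the two sequences compared in the commit message (x86_64 only).

```
// x86_64-only illustration of the two SeqCst-store sequences discussed above.
use core::arch::asm;

/// SeqCst store written as a plain store followed by `mfence`
/// (the sequence the patch replaces).
#[inline]
fn store_seqcst_mfence(dst: &mut u64, val: u64) {
    let p: *mut u64 = dst;
    // SAFETY: `p` comes from an exclusive reference, so writing 8 bytes
    // through it is valid; `mfence` has no memory operands.
    unsafe {
        asm!(
            "mov qword ptr [{dst}], {val}",
            "mfence",
            dst = in(reg) p,
            val = in(reg) val,
            options(nostack, preserves_flags),
        );
    }
}

/// SeqCst store written as a plain store followed by a dummy locked RMW
/// on the stack (the sequence the patch switches to).
#[inline]
fn store_seqcst_lock_or(dst: &mut u64, val: u64) {
    let p: *mut u64 = dst;
    // SAFETY: as above for the store; `lock or dword ptr [rsp], 0` rewrites
    // 4 bytes at [rsp] with their current value, so no data actually changes.
    unsafe {
        asm!(
            "mov qword ptr [{dst}], {val}",
            "lock or dword ptr [rsp], 0",
            dst = in(reg) p,
            val = in(reg) val,
            // No `preserves_flags`: OR clobbers OF, CF, SF, ZF, and PF.
            options(nostack),
        );
    }
}

fn main() {
    let mut x = 0;
    store_seqcst_mfence(&mut x, 1);
    store_seqcst_lock_or(&mut x, 2);
    assert_eq!(x, 2);
}
```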