From a0d7f3f45ec1e67813778dc2a73557eb74943a39 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Fri, 22 Mar 2024 01:47:37 +0900
Subject: [PATCH] x86_64: Use `lock or` instead of mfence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

10-35% faster at least in simple cases on Coffee Lake.

Below are the results of the microbenchmark on an Intel Core i7-9750H
(Coffee Lake) with the ORDERING constant set to SeqCst.

```
bench_portable_atomic_arch/u128_store
                        time:   [11.610 ns 11.670 ns 11.738 ns]
                        change: [-36.119% -35.236% -34.348%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
bench_portable_atomic_arch/u128_concurrent_load_store
                        time:   [202.30 µs 203.54 µs 205.24 µs]
                        change: [-32.313% -31.167% -29.845%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  2 (2.00%) high mild
  6 (6.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store
                        time:   [395.74 µs 397.37 µs 398.98 µs]
                        change: [-18.517% -17.560% -16.582%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
  12 (12.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store_swap
                        time:   [791.21 µs 793.43 µs 795.69 µs]
                        change: [-10.682% -10.197% -9.6789%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) low severe
  2 (2.00%) low mild
  2 (2.00%) high mild
  1 (1.00%) high severe
```
---
 bench/benches/bench.rs      | 23 +++++++++++++++++------
 src/imp/atomic128/x86_64.rs | 25 +++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/bench/benches/bench.rs b/bench/benches/bench.rs
index c5ed6b6f0..e8338e00c 100644
--- a/bench/benches/bench.rs
+++ b/bench/benches/bench.rs
@@ -51,6 +51,8 @@ mod spinlock_fallback;
 
 const THREADS: usize = 2;
 const N: u32 = 5000;
+const ORDERING: Ordering = Ordering::AcqRel;
+// const ORDERING: Ordering = Ordering::SeqCst;
 
 trait AtomicInt<T: Copy>: Sized + Send + Sync {
     fn new(v: T) -> Self;
@@ -69,24 +71,32 @@
             }
             #[inline]
             fn load_(&self) -> $int_type {
-                self.load(Ordering::Acquire)
+                self.load(if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING })
             }
             #[inline]
             fn store_(&self, val: $int_type) {
-                self.store(val, Ordering::Release);
+                self.store(
+                    val,
+                    if ORDERING == Ordering::AcqRel { Ordering::Release } else { ORDERING },
+                );
             }
             #[inline]
             fn swap_(&self, val: $int_type) -> $int_type {
-                self.swap(val, Ordering::AcqRel)
+                self.swap(val, ORDERING)
             }
             #[inline]
             fn compare_exchange_(&self, old: $int_type, new: $int_type) -> $int_type {
-                self.compare_exchange(old, new, Ordering::AcqRel, Ordering::Acquire)
-                    .unwrap_or_else(|x| x)
+                self.compare_exchange(
+                    old,
+                    new,
+                    ORDERING,
+                    if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING },
+                )
+                .unwrap_or_else(|x| x)
             }
             #[inline]
             fn fetch_add_(&self, val: $int_type) -> $int_type {
-                self.fetch_add(val, Ordering::AcqRel)
+                self.fetch_add(val, ORDERING)
             }
         }
     };
@@ -97,6 +107,7 @@ macro_rules! impl_atomic_no_order {
         impl AtomicInt<$int_type> for $atomic_type {
             #[inline]
             fn new(v: $int_type) -> Self {
+                assert_eq!(ORDERING, Ordering::AcqRel);
                 Self::new(v)
             }
             #[inline]
diff --git a/src/imp/atomic128/x86_64.rs b/src/imp/atomic128/x86_64.rs
index 8d06a814c..2d8454b62 100644
--- a/src/imp/atomic128/x86_64.rs
+++ b/src/imp/atomic128/x86_64.rs
@@ -68,6 +68,23 @@ macro_rules! ptr_modifier {
     };
 }
 
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "32")]
+macro_rules! sp {
+    () => {
+        "esp"
+    };
+}
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "64")]
+macro_rules! sp {
+    () => {
+        "rsp"
+    };
+}
+
 // Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
 // requirements for the currently enabled target features. In the first place,
 // there is no option in the x86 assembly for such case, like ARM .arch_extension,
@@ -185,10 +202,14 @@ unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
             Ordering::SeqCst => {
                 asm!(
                     concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
-                    "mfence",
+                    // Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake (https://github.com/taiki-e/portable-atomic/pull/156).
+                    // Based on x86 64-bit atomic SeqCst store using SSE generated by LLVM.
+                    // https://godbolt.org/z/9sKEr8YWc
+                    concat!("lock or dword ptr [", sp!(), "], 0"),
                     dst = in(reg) dst,
                     val = in(xmm_reg) val,
-                    options(nostack, preserves_flags),
+                    // Do not use `preserves_flags` because OR modifies the OF, CF, SF, ZF, and PF flags.
+                    options(nostack),
                 );
             }
             _ => unreachable!(),
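
Note (not part of the patch): for readers unfamiliar with the trick above, following a plain store (here `vmovdqa`) with a dummy locked read-modify-write such as `lock or dword ptr [rsp], 0` is a well-known alternative to `mfence` for making the store SeqCst on x86: any `lock`-prefixed instruction acts as a full memory barrier, and a no-op `or` on the stack pointer's cache line (which is very likely already owned by the core) tends to be cheaper than `mfence`. The sketch below is illustrative only; it uses plain `u64` stores so it compiles on stable Rust without this patch, the function names are hypothetical, and the asm mirrors the two sequences compared in the commit message (x86_64 only).

```
// x86_64-only illustration of the two SeqCst-store sequences discussed above.
use core::arch::asm;

/// SeqCst store written as a plain store followed by `mfence`
/// (the sequence the patch replaces).
#[inline]
fn store_seqcst_mfence(dst: &mut u64, val: u64) {
    let p: *mut u64 = dst;
    // SAFETY: `p` comes from an exclusive reference, so writing 8 bytes
    // through it is valid; `mfence` has no memory operands.
    unsafe {
        asm!(
            "mov qword ptr [{dst}], {val}",
            "mfence",
            dst = in(reg) p,
            val = in(reg) val,
            options(nostack, preserves_flags),
        );
    }
}

/// SeqCst store written as a plain store followed by a dummy locked RMW
/// on the stack (the sequence the patch switches to).
#[inline]
fn store_seqcst_lock_or(dst: &mut u64, val: u64) {
    let p: *mut u64 = dst;
    // SAFETY: as above for the store; `lock or dword ptr [rsp], 0` rewrites
    // 4 bytes at [rsp] with their current value, so no data actually changes.
    unsafe {
        asm!(
            "mov qword ptr [{dst}], {val}",
            "lock or dword ptr [rsp], 0",
            dst = in(reg) p,
            val = in(reg) val,
            // No `preserves_flags`: OR clobbers OF, CF, SF, ZF, and PF.
            options(nostack),
        );
    }
}

fn main() {
    let mut x = 0;
    store_seqcst_mfence(&mut x, 1);
    store_seqcst_lock_or(&mut x, 2);
    assert_eq!(x, 2);
}
```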