From 6519b13ce15a99ef3e9273139a55c6fe7268a909 Mon Sep 17 00:00:00 2001
From: niluxv
Date: Thu, 2 May 2024 10:30:57 +0200
Subject: [PATCH] Introduce zeroizer based on (non-volatile) `write_bytes` and
 `asm!` optimisation barrier and remove many slower ones

*Breaking:* Removes many zeroizers previously available.

This new zeroizer cannot be optimised out, according to my current
understanding of the operational semantics of Rust. It can be implemented on
stable, is target-independent, and is faster than all of our handwritten
ones. Therefore we can remove all the handwritten target-specific ones. This
is good, since their implementation turned out to be prone to mistakes.
---
 Cargo.toml                              |   1 -
 benches/bench_zeroizers.rs              |  25 +-
 src/internals/zeroize.rs                | 233 +++++++----------
 src/internals/zeroize/asm_x86_64.rs     | 326 ------------------------
 src/internals/zeroize/system.rs         | 104 --------
 src/internals/zeroize/volatile_write.rs | 100 --------
 src/zeroize.rs                          | 289 ++------------------
 src/zeroize/tests.rs                    |  80 ------
 8 files changed, 116 insertions(+), 1042 deletions(-)
 delete mode 100644 src/internals/zeroize/asm_x86_64.rs
 delete mode 100644 src/internals/zeroize/system.rs
 delete mode 100644 src/internals/zeroize/volatile_write.rs

diff --git a/Cargo.toml b/Cargo.toml
index 251ec1e..0824851 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,7 +38,6 @@ dev = ["std"]
 [dependencies]
 allocator-api2 = { version = "0.2", default-features = false }
 cfg-if = "1.0"
-libc = "0.2"
 mirai-annotations = "1.12"
 sptr = "0.3"
 thiserror = { version = "1.0", optional = true }
diff --git a/benches/bench_zeroizers.rs b/benches/bench_zeroizers.rs
index 2c62483..47ac729 100644
--- a/benches/bench_zeroizers.rs
+++ b/benches/bench_zeroizers.rs
@@ -1,13 +1,6 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))]
-use secmem_alloc::zeroize::AsmRepStosZeroizer;
-#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
-use secmem_alloc::zeroize::X86_64AvxZeroizer;
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-use secmem_alloc::zeroize::X86_64Sse2Zeroizer;
 use secmem_alloc::zeroize::{
-    LibcZeroizer, MemZeroizer, VolatileMemsetZeroizer, VolatileWrite8Zeroizer,
-    VolatileWriteZeroizer,
+    MemZeroizer, MemsetAsmBarierZeroizer, VolatileMemsetZeroizer, VolatileWrite8Zeroizer,
 };
 
 fn zeroize_b127<Z: MemZeroizer>(z: Z, array: &mut [u8; 127]) {
@@ -50,21 +43,9 @@ macro_rules! bench_zeroizers {
         $cgroup.bench_function("VolatileMemsetZeroizer", |b| {
             b.iter(|| $bench_function(VolatileMemsetZeroizer, &mut $array.0))
         });
-        $cgroup.bench_function("LibcZeroizer", |b| {
-            b.iter(|| $bench_function(LibcZeroizer, &mut $array.0))
+        $cgroup.bench_function("MemsetAsmBarierZeroizer", |b| {
+            b.iter(|| $bench_function(MemsetAsmBarierZeroizer, &mut $array.0))
         });
-        $cgroup.bench_function("VolatileWriteZeroizer", |b| {
-            b.iter(|| $bench_function(VolatileWriteZeroizer, &mut $array.0))
-        });
-        $cgroup.bench_function("VolatileWrite8Zeroizer", |b| {
-            b.iter(|| $bench_function(VolatileWrite8Zeroizer, &mut $array.0))
-        });
-        #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-        {
-            $cgroup.bench_function("X86_64Sse2Zeroizer", |b| {
-                b.iter(|| $bench_function(X86_64Sse2Zeroizer, &mut $array.0))
-            });
-        }
         #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
         {
             $cgroup.bench_function("X86_64AvxZeroizer", |b| {
diff --git a/src/internals/zeroize.rs b/src/internals/zeroize.rs
index 65bb31d..c2da080 100644
--- a/src/internals/zeroize.rs
+++ b/src/internals/zeroize.rs
@@ -1,35 +1,103 @@
 //! Utility functions for securely wiping memory.
 //!
-//! Contains wrappers around intrinsics and ffi functions necessary for the
-//! [`crate::zeroize`] module.
+//! Utility functions for the [`crate::zeroize`] module to securely wipe
+//! memory, implemented using cross-platform pure Rust volatile writes.
 
-#[cfg(target_arch = "x86_64")]
-mod asm_x86_64;
-#[cfg(target_arch = "x86_64")]
-pub use asm_x86_64::*;
+use crate::macros::precondition_memory_range;
+use crate::util::is_aligned_ptr_mut;
+use mirai_annotations::debug_checked_precondition;
 
-mod system;
-pub use system::*;
+/// Zeroize the memory pointed to by `ptr` and of size `len` bytes, by
+/// overwriting it byte for byte using volatile writes.
+///
+/// This is guaranteed not to be elided by the compiler.
+///
+/// # Safety
+/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see
+/// the [`std::ptr`] documentation. In particular this function is not atomic.
+pub unsafe fn volatile_write_zeroize(mut ptr: *mut u8, len: usize) {
+    precondition_memory_range!(ptr, len);
+    for _i in 0..len {
+        // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now,
+        // after `_i` steps `len - _i > 0` bytes are left, so `ptr` is valid for
+        // a byte write
+        unsafe {
+            core::ptr::write_volatile(ptr, 0u8);
+        }
+        // SAFETY: after increment, `ptr` points into the same allocation if `_i == len`
+        // or one byte past it, so `add` is sound
+        ptr = unsafe { ptr.add(1) };
+    }
+}
 
-mod volatile_write;
-pub use volatile_write::*;
+/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple
+/// of 8 bytes.
+///
+/// This function rounds down `len` to a multiple of 8 and then zeroizes the
+/// memory pointed to by `ptr` for that length. This operation is guaranteed
+/// not to be elided by the compiler. If `len` is a multiple of 8 then this
+/// zeroizes the entire specified block of memory. Returns a pointer to the byte
+/// after the last zeroed byte, with the provenance of `ptr`.
+///
+/// # Safety
+/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see
+/// the [`std::ptr`] documentation. In particular this function is not atomic.
+///
+/// Furthermore, `ptr` *must* be at least 8 byte aligned.
+pub unsafe fn zeroize_align8_block8(mut ptr: *mut u8, len: usize) -> *mut u8 {
+    precondition_memory_range!(ptr, len);
+    debug_checked_precondition!(is_aligned_ptr_mut(ptr, 8));
+
+    let nblocks = (len - len % 8) / 8;
+    for _i in 0..nblocks {
+        // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now,
+        // after `_i` steps `len - 8*_i >= 8` bytes are left, so `ptr` is valid
+        // for an 8 byte write
+        // SAFETY: `ptr` was originally 8 byte aligned by caller contract and we
+        // only added a multiple of 8, so it is still 8 byte aligned
+        unsafe {
+            core::ptr::write_volatile(ptr.cast::<u64>(), 0u64);
+        }
+        // SAFETY: after increment, `ptr` points into the same allocation or (if `8*_i
+        // == len`) at most one byte past it, so `add` is sound; `ptr` stays 8
+        // byte aligned
+        ptr = unsafe { ptr.add(8) };
+    }
+    ptr
+}
 
-/// Volatile write byte to memory.
+/// Zeroize the memory pointed to by `ptr` and of size `len % 8` bytes.
 ///
-/// This uses the [`core::intrinsics::volatile_set_memory`] intrinsic and can
-/// only be used on nightly, with the `nightly` feature enabled.
+/// This can be used to zeroize the bytes left unzeroized by
+/// `zeroize_align8_block8` if `len` is not a multiple of 8. This operation is
+/// guaranteed not to be elided by the compiler.
/// /// # Safety /// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see /// the [`std::ptr`] documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. -#[cfg(feature = "nightly_core_intrinsics")] -pub unsafe fn volatile_memset(ptr: *mut u8, val: u8, len: usize) { - crate::macros::precondition_memory_range!(ptr, len); - // SAFETY: the caller must uphold the safety contract - unsafe { - core::intrinsics::volatile_set_memory(ptr, val, len); +/// +/// Furthermore, `ptr` *must* be at least 4 byte aligned. +pub unsafe fn zeroize_align4_tail8(mut ptr: *mut u8, len: usize) { + precondition_memory_range!(ptr, len % 8); + debug_checked_precondition!(is_aligned_ptr_mut(ptr, 4)); + + if len % 8 >= 4 { + // SAFETY: `ptr` is valid for `len % 8` bytes by caller contract + // SAFETY: `ptr` is still 4 byte aligned by caller contract + unsafe { + core::ptr::write_volatile(ptr.cast::(), 0u32); + } + ptr = unsafe { ptr.add(4) }; + } + // the final remainder (at most 3 bytes) is zeroed byte-for-byte + // SAFETY: `ptr` has been incremented by a multiple of 4 <= `len` so `ptr` + // points to an allocation of `len % 4` bytes, so `ptr` can be written to + // and incremented `len % 4` times + for _i in 0..(len % 4) { + unsafe { + core::ptr::write_volatile(ptr, 0u8); + } + ptr = unsafe { ptr.add(1) }; } } @@ -76,135 +144,18 @@ mod tests { assert_eq!(&array[..], &expected[..]); } - #[cfg(feature = "nightly_core_intrinsics")] - #[test] - fn test_volatile_memset() { - test_b128_zeroizer(|ptr: *mut u8, len: usize| unsafe { volatile_memset(ptr, 0, len) }) - } - - #[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" - ))] - #[test] - #[cfg_attr(miri, ignore)] // ffi - fn test_explicit_bzero() { - test_b128_zeroizer(|ptr: *mut u8, len: usize| unsafe { libc_explicit_bzero(ptr, len) }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_asm_ermsb_zeroize() { - test_b128_zeroizer(|ptr: *mut u8, len: usize| unsafe { asm_ermsb_zeroize(ptr, len) }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] #[test] fn test_volatile_write_zeroize() { - test_b128_zeroizer(|ptr: *mut u8, len: usize| unsafe { volatile_write_zeroize(ptr, len) }) + test_b128_zeroizer(|ptr, len| unsafe { volatile_write_zeroize(ptr, len) }) } - #[cfg(feature = "nightly_core_intrinsics")] #[test] - fn test_volatile_memset_lowalign() { - test_b239_lowalign_zeroizer(|ptr: *mut u8, len: usize| unsafe { - volatile_memset(ptr, 0, len) - }) - } - - #[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" - ))] - #[test] - #[cfg_attr(miri, ignore)] // ffi - fn test_explicit_bzero_lowalign() { - test_b239_lowalign_zeroizer(|ptr: *mut u8, len: usize| unsafe { - libc_explicit_bzero(ptr, len) - }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_asm_ermsb_zeroize_lowalign() { - test_b239_lowalign_zeroizer(|ptr: *mut u8, len: usize| unsafe { - asm_ermsb_zeroize(ptr, len) - }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] - #[test] - fn 
test_volatile_write_zeroize_lowalign() { - test_b239_lowalign_zeroizer(|ptr: *mut u8, len: usize| unsafe { - volatile_write_zeroize(ptr, len) - }) + fn test_lowalign_volatile_write_zeroize() { + test_b239_lowalign_zeroizer(|ptr, len| unsafe { volatile_write_zeroize(ptr, len) }) } #[test] fn test_zeroize_align8_block8() { test_b257_align64_block_zeroizer(|ptr, len| unsafe { zeroize_align8_block8(ptr, len) }) } - - #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_x86_64_simd16_zeroize_align16_block16() { - test_b257_align64_block_zeroizer(|ptr, len| unsafe { - x86_64_simd16_zeroize_align16_block16(ptr, len) - }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_x86_64_simd16_unroll2_zeroize_align16_block16() { - test_b257_align64_block_zeroizer(|ptr, len| unsafe { - x86_64_simd16_unroll2_zeroize_align16_block16(ptr, len) - }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_x86_64_simd32_zeroize_align32_block32() { - test_b257_align64_block_zeroizer(|ptr, len| unsafe { - x86_64_simd32_zeroize_align32_block32(ptr, len) - }) - } - - #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_x86_64_simd32_unroll2_zeroize_align32_block32() { - test_b257_align64_block_zeroizer(|ptr, len| unsafe { - x86_64_simd32_unroll2_zeroize_align32_block32(ptr, len) - }) - } - - #[cfg(all( - target_arch = "x86_64", - target_feature = "avx512f", - feature = "nightly_stdsimd" - ))] - #[test] - #[cfg_attr(miri, ignore)] // asm - fn test_x86_64_simd64_zeroize_align64_block64() { - test_b257_align64_block_zeroizer(|ptr, len| unsafe { - x86_64_simd64_zeroize_align64_block64(ptr, len) - }) - } } diff --git a/src/internals/zeroize/asm_x86_64.rs b/src/internals/zeroize/asm_x86_64.rs deleted file mode 100644 index 69c3113..0000000 --- a/src/internals/zeroize/asm_x86_64.rs +++ /dev/null @@ -1,326 +0,0 @@ -//! Utility functions for securely wiping memory, implemented in asm for x86_64 -//! cpus. - -use crate::macros::precondition_memory_range; -use crate::util::is_aligned_ptr_mut; -use mirai_annotations::debug_checked_precondition; - -/// Overwrite memory with zeros. This operation will not be elided by the -/// compiler. -/// -/// This uses inline assembly in Rust. The implementation makes use of the -/// efficient `rep stosb` memory set functionality on modern x86_64 cpus. This -/// is very slow for small amounts of data but very efficient for zeroizing -/// large amounts of data (depending an CPU architecture though), works on -/// stable, and does not require a libc. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. 
-#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] -pub unsafe fn asm_ermsb_zeroize(ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - - unsafe { - core::arch::asm!( - "rep stosb byte ptr es:[rdi], al", - // `len` in the rcx register - inout("rcx") len => _, - // `ptr` int the rdi register - inout("rdi") ptr => _, - // zero byte to al (first byte of rax) register - in("al") 0u8, - options(nostack), - ); - } -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 16 bytes. -/// -/// This function rounds down `len` to a multiple of 16 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 16 then this -/// zeroizes the entire specified block of memory. Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. -/// -/// This uses sse2 instructions in inline asm to zeroize the memory with blocks -/// of 16 bytes at a time. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 16 byte aligned. -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -pub unsafe fn x86_64_simd16_zeroize_align16_block16(mut ptr: *mut u8, len: usize) -> *mut u8 { - use core::arch::x86_64 as arch; - - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 16)); - - let nblocks = (len - len % 16) / 16; - - for _i in 0..nblocks { - // SAFETY: `ptr` is valid for a `len >= nblocks*16` byte write, so we can write - // `nblocks` times 16 bytes and increment `ptr` by 16 bytes; `ptr` stays 16 byte - // aligned - unsafe { - // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now, - // after `_i` steps `len - 16*_i >= 16` bytes are left, so `ptr` is valid - // for a 16 byte write; also `ptr` is 16 byte aligned - core::arch::asm!( - " - /* write 16 zero bytes to ptr */ - vmovdqa xmmword ptr [{0}], {1} - ", - in(reg) ptr, - in(xmm_reg) arch::_mm_setzero_si128(), - options(nostack), - ); - // NOTE: increment `ptr` outside of the asm to maintain provenance - // SAFETY: this stays within the memory where `ptr` is valid for writes and - // maintains 16 byte alignment - ptr = ptr.add(16); - } - } - ptr -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 16 bytes. -/// -/// This function rounds down `len` to a multiple of 16 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 16 then this -/// zeroizes the entire specified block of memory. Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. -/// -/// This uses sse2 instructions in inline asm to zeroize the memory with blocks -/// of 16 bytes at a time. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 16 byte aligned. 
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -pub unsafe fn x86_64_simd16_unroll2_zeroize_align16_block16( - mut ptr: *mut u8, - len: usize, -) -> *mut u8 { - use core::arch::x86_64 as arch; - - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 16)); - - let nblocks = (len - len % 16) / 16; - - // SAFETY: `ptr` is valid for a `len >= nblocks*16` byte write, so we can write - // `nblocks` times 16 bytes and increment `ptr` by 16 bytes; `ptr` stays 16 byte - // aligned - for _i in 0..nblocks / 2 { - unsafe { - core::arch::asm!( - " - /* write 16 zero bytes to ptr */ - vmovdqa xmmword ptr [{0}], {1} - vmovdqa xmmword ptr [{0} + 16], {1} - ", - in(reg) ptr, - in(xmm_reg) arch::_mm_setzero_si128(), - options(nostack), - ); - ptr = ptr.add(32); - } - } - if nblocks % 2 == 1 { - unsafe { - core::arch::asm!( - " - /* write 16 zero bytes to ptr */ - vmovdqa xmmword ptr [{0}], {1} - ", - in(reg) ptr, - in(xmm_reg) arch::_mm_setzero_si128(), - options(nostack), - ); - ptr = ptr.add(16); - } - } - ptr -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 32 bytes. -/// -/// This function rounds down `len` to a multiple of 32 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 32 then this -/// zeroizes the entire specified block of memory. Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. -/// -/// This uses avx2 instructions in inline asm to zeroize the memory with blocks -/// of 32 bytes at a time. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 32 byte aligned. -#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -pub unsafe fn x86_64_simd32_zeroize_align32_block32(mut ptr: *mut u8, len: usize) -> *mut u8 { - use core::arch::x86_64 as arch; - - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 32)); - - let nblocks = (len - len % 32) / 32; - - for _i in 0..nblocks { - // SAFETY: `ptr` is valid for a `len >= nblocks*32` byte write, so we can write - // `nblocks` times 32 bytes and increment `ptr` by 32 bytes; `ptr` stays 32 byte - // aligned - unsafe { - // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now, - // after `_i` steps `len - 32*_i >= 32` bytes are left, so `ptr` is valid - // for a 32 byte write; also `ptr` is 32 byte aligned - core::arch::asm!( - " - /* write 32 zero bytes to ptr */ - vmovdqa ymmword ptr [{0}], {1} - ", - in(reg) ptr, - in(ymm_reg) arch::_mm256_setzero_si256(), - options(nostack), - ); - // NOTE: increment `ptr` outside of the asm to maintain provenance - // SAFETY: this stays within the memory where `ptr` is valid for writes and - // maintains 32 byte alignment - ptr = ptr.add(32); - } - } - ptr -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 32 bytes. -/// -/// This function rounds down `len` to a multiple of 32 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 32 then this -/// zeroizes the entire specified block of memory. 
Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. -/// -/// This uses avx2 instructions in inline asm to zeroize the memory with blocks -/// of 32 bytes at a time. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 32 byte aligned. -#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -pub unsafe fn x86_64_simd32_unroll2_zeroize_align32_block32( - mut ptr: *mut u8, - len: usize, -) -> *mut u8 { - use core::arch::x86_64 as arch; - - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 32)); - - let nblocks = (len - len % 32) / 32; - - // SAFETY: `ptr` is valid for a `len >= nblocks*32` byte write, so we can write - // `nblocks` times 32 bytes and increment `ptr` by 32 bytes; `ptr` stays 32 byte - // aligned - for _i in 0..(nblocks / 2) { - unsafe { - core::arch::asm!( - " - /* write 64 zero bytes to ptr */ - vmovdqa ymmword ptr [{0}], {1} - vmovdqa ymmword ptr [{0} + 32], {1} - ", - in(reg) ptr, - in(ymm_reg) arch::_mm256_setzero_si256(), - options(nostack), - ); - ptr = ptr.add(64); - } - } - if nblocks % 2 == 1 { - unsafe { - core::arch::asm!( - " - /* write 32 zero bytes to ptr */ - vmovdqa ymmword ptr [{0}], {1} - ", - in(reg) ptr, - in(ymm_reg) arch::_mm256_setzero_si256(), - options(nostack), - ); - ptr = ptr.add(32); - } - } - ptr -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 64 bytes. -/// -/// This function rounds down `len` to a multiple of 64 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 64 then this -/// zeroizes the entire specified block of memory. Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. -/// -/// This uses avx512 instructions in inline asm to zeroize the memory with -/// blocks of 64 bytes at a time. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 64 byte aligned. 
-#[cfg(all( - target_arch = "x86_64", - target_feature = "avx512f", - feature = "nightly_stdsimd" -))] -pub unsafe fn x86_64_simd64_zeroize_align64_block64(mut ptr: *mut u8, len: usize) -> *mut u8 { - use core::arch::x86_64 as arch; - - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 64)); - - let nblocks = (len - len % 64) / 64; - - for _i in 0..nblocks { - // SAFETY: `ptr` is valid for a `len >= nblocks*64` byte write, so we can write - // `nblocks` times 64 bytes and increment `ptr` by 64 bytes; `ptr` stays 64 byte - // aligned - unsafe { - // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now, - // after `_i` steps `len - 64*_i >= 64` bytes are left, so `ptr` is valid - // for a 64 byte write; also `ptr` is 64 byte aligned - core::arch::asm!( - " - /* write 64 zero bytes to ptr */ - vmovdqa64 zmmword ptr [{0}], {1} - ", - in(reg) ptr, - in(zmm_reg) arch::_mm512_setzero_si512(), - options(nostack), - ); - // NOTE: increment `ptr` outside of the asm to maintain provenance - // SAFETY: this stays within the memory where `ptr` is valid for writes and - // maintains 64 byte alignment - ptr = ptr.add(64); - } - } - ptr -} diff --git a/src/internals/zeroize/system.rs b/src/internals/zeroize/system.rs deleted file mode 100644 index 287a595..0000000 --- a/src/internals/zeroize/system.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! Bindings to system functions for securely wiping memory. - -use crate::macros::precondition_memory_range; - -/// Overwrite memory with zeros. This operation will not be elided by the -/// compiler. -/// -/// This uses the `explicit_bzero` function present in many recent libcs. -/// -/// # Safety -/// It's C. But the safety requirement is quite obvious: The caller *must* -/// ensure that `ptr` is valid for writes of `len` bytes, see the [`std::ptr`] -/// documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. -#[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - all(target_env = "gnu", unix), - target_env = "musl" -))] -pub unsafe fn libc_explicit_bzero(ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - // SAFETY: the caller must uphold the safety contract - unsafe { - libc::explicit_bzero(ptr as *mut libc::c_void, len as libc::size_t); - } -} - -/// Overwrite memory with zeros. This operation will not be elided by the -/// compiler. -/// -/// This uses the `explicit_bzero` function present in many recent libcs. -/// -/// # Safety -/// It's C. But the safety requirement is quite obvious: The caller *must* -/// ensure that `ptr` is valid for writes of `len` bytes, see the [`std::ptr`] -/// documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. -#[cfg(all(target_env = "gnu", windows))] -pub unsafe fn libc_explicit_bzero(ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - extern "C" { - fn explicit_bzero(ptr: *mut libc::c_void, len: libc::size_t); - } - - // SAFETY: the caller must uphold the safety contract - unsafe { - explicit_bzero(ptr as *mut libc::c_void, len as libc::size_t); - } -} - -/// Overwrite memory with zeros. This operation will not be elided by the -/// compiler. -/// -/// This uses the `explicit_bzero` function present in many recent libcs. 
-/// -/// # Safety -/// It's C. But the safety requirement is quite obvious: The caller *must* -/// ensure that `ptr` is valid for writes of `len` bytes, see the [`std::ptr`] -/// documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. -#[cfg(target_os = "netbsd")] -pub unsafe fn libc_explicit_bzero(ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - // SAFETY: the caller must uphold the safety contract - unsafe { - libc::explicit_memset( - ptr as *mut libc::c_void, - 0 as libc::c_int, - len as libc::size_t, - ); - } -} - -/// Overwrite memory with zeros. This operation will not be elided by the -/// compiler. -/// -/// This uses the `explicit_bzero` function present in many recent libcs. -/// -/// # Safety -/// It's C. But the safety requirement is quite obvious: The caller *must* -/// ensure that `ptr` is valid for writes of `len` bytes, see the [`std::ptr`] -/// documentation. In particular this function is not atomic. -// In addition `ptr` needs to be properly aligned, but because we are talking -// about bytes (therefore byte alignment), it *always* is. -#[cfg(any(target_os = "macos", target_os = "ios"))] -pub unsafe fn libc_explicit_bzero(ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - // SAFETY: the caller must uphold the safety contract - unsafe { - // the zero value is a `c_int` (`i32` by default), but then converted to - // `unsigned char` (`u8`) - libc::memset_s( - ptr as *mut libc::c_void, - len as libc::size_t, - 0 as libc::c_int, - len as libc::size_t, - ); - } -} diff --git a/src/internals/zeroize/volatile_write.rs b/src/internals/zeroize/volatile_write.rs deleted file mode 100644 index 674356e..0000000 --- a/src/internals/zeroize/volatile_write.rs +++ /dev/null @@ -1,100 +0,0 @@ -//! Utility functions for securely wiping memory, implemented using -//! cross-platform volatile writes. - -use crate::macros::precondition_memory_range; -use crate::util::is_aligned_ptr_mut; -use mirai_annotations::debug_checked_precondition; - -/// Zeroize the memory pointed to by `ptr` and of size `len` bytes, by -/// overwriting it byte for byte using volatile writes. -/// -/// This is guarantied to be not elided by the compiler. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -pub unsafe fn volatile_write_zeroize(mut ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - for _i in 0..len { - // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now, - // after `_i` steps `len - _i > 0` bytes are left, so `ptr` is valid for - // a byte write - unsafe { - core::ptr::write_volatile(ptr, 0u8); - } - // SAFETY: after increment, `ptr` points into the same allocation if `_i == len` - // or one byte past it, so `add` is sound - ptr = unsafe { ptr.add(1) }; - } -} - -/// Zeroize the memory pointed to by `ptr` for `len` rounded down to a multiple -/// of 8 bytes. -/// -/// This function rounds down `len` to a multiple of 8 and then zeroizes the -/// memory pointed to by `ptr` for that length. This operation is guarantied to -/// be not elided by the compiler. If `len` is a multiple of 8 then this -/// zeroizes the entire specified block of memory. Returns a pointer to the byte -/// after the last zeroed byte, with the provenance of `ptr`. 
-/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 8 byte aligned. -pub unsafe fn zeroize_align8_block8(mut ptr: *mut u8, len: usize) -> *mut u8 { - precondition_memory_range!(ptr, len); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 8)); - - let nblocks = (len - len % 8) / 8; - for _i in 0..nblocks { - // SAFETY: `ptr` originally pointed into an allocation of `len` bytes so now, - // after `_i` steps `len - 8*_i >= 8` bytes are left, so `ptr` is valid - // for an 8 byte write SAFETY: `ptr` was originally 8 byte aligned by - // caller contract and we only added a multiple of 8 so it is still 8 - // byte aligned - unsafe { - core::ptr::write_volatile(ptr.cast::(), 0u64); - } - // SAFETY: after increment, `ptr` points into the same allocation or (if `8*_i - // == len`) at most one byte past it, so `add` is sound; `ptr` stays 8 - // byte aligned - ptr = unsafe { ptr.add(8) }; - } - ptr -} - -/// Zeroize the memory pointed to by `ptr` and of size `len % 8` bytes. -/// -/// This can be used to zeroize the bytes left unzeroized by -/// `zeroize_align8_block8` if `len` is not a multiple of 8. This operation is -/// guarantied to be not elided by the compiler. -/// -/// # Safety -/// The caller *must* ensure that `ptr` is valid for writes of `len` bytes, see -/// the [`std::ptr`] documentation. In particular this function is not atomic. -/// -/// Furthermore, `ptr` *must* be at least 4 byte aligned. -pub unsafe fn zeroize_align4_tail8(mut ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len % 8); - debug_checked_precondition!(is_aligned_ptr_mut(ptr, 4)); - - if len % 8 >= 4 { - // SAFETY: `ptr` is valid for `len % 8` bytes by caller contract - // SAFETY: `ptr` is still 4 byte aligned by caller contract - unsafe { - core::ptr::write_volatile(ptr.cast::(), 0u32); - } - ptr = unsafe { ptr.add(4) }; - } - // the final remainder (at most 3 bytes) is zeroed byte-for-byte - // SAFETY: `ptr` has been incremented by a multiple of 4 <= `len` so `ptr` - // points to an allocation of `len % 4` bytes, so `ptr` can be written to - // and incremented `len % 4` times - for _i in 0..(len % 4) { - unsafe { - core::ptr::write_volatile(ptr, 0u8); - } - ptr = unsafe { ptr.add(1) }; - } -} diff --git a/src/zeroize.rs b/src/zeroize.rs index 3c2f8b1..7be5e02 100644 --- a/src/zeroize.rs +++ b/src/zeroize.rs @@ -72,43 +72,9 @@ cfg_if::cfg_if! { /// the selected features and the version of this library. pub type DefaultMemZeroizer = VolatileMemsetZeroizer; pub(crate) use VolatileMemsetZeroizer as DefaultMemZeroizerConstructor; - } else if #[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" - ))] { - /// Best (i.e. fastest) [`MemZeroizer`] available for the target. - /// - /// Which [`MemZeroizer`] this is is an implementation detail, can depend on the target and - /// the selected features and the version of this library. - pub type DefaultMemZeroizer = LibcZeroizer; - pub(crate) use LibcZeroizer as DefaultMemZeroizerConstructor; - } else if #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] { - /// Best (i.e. fastest) [`MemZeroizer`] available for the target. 
- /// - /// Which [`MemZeroizer`] this is is an implementation detail, can depend on the target and - /// the selected features and the version of this library. - pub type DefaultMemZeroizer = X86_64AvxZeroizer; - pub(crate) use X86_64AvxZeroizer as DefaultMemZeroizerConstructor; - } else if #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] { - /// Best (i.e. fastest) [`MemZeroizer`] available for the target. - /// - /// Which [`MemZeroizer`] this is is an implementation detail, can depend on the target and - /// the selected features and the version of this library. - pub type DefaultMemZeroizer = X86_64Sse2Zeroizer; - pub(crate) use X86_64Sse2Zeroizer as DefaultMemZeroizerConstructor; } else { - /// Best (i.e. fastest) [`MemZeroizer`] available for the target. - /// - /// Which [`MemZeroizer`] this is is an implementation detail, can depend on the target and - /// the selected features and the version of this library. - pub type DefaultMemZeroizer = VolatileWrite8Zeroizer; - pub(crate) use VolatileWrite8Zeroizer as DefaultMemZeroizerConstructor; + pub type DefaultMemZeroizer = MemsetAsmBarierZeroizer; + pub(crate) use MemsetAsmBarierZeroizer as DefaultMemZeroizerConstructor; } } @@ -118,12 +84,6 @@ pub(crate) use VolatileWrite8Zeroizer as TestZeroizer; /// This zeroizer uses the volatile memset intrinsic which does not /// yet have a stable counterpart. It should be very fast, but requires /// nightly. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. #[cfg(feature = "nightly_core_intrinsics")] #[derive(Debug, Copy, Clone, Default)] pub struct VolatileMemsetZeroizer; @@ -133,124 +93,41 @@ impl MemZeroizer for VolatileMemsetZeroizer { unsafe fn zeroize_mem_blocks(&self, ptr: *mut u8, len: usize) { precondition_memory_range!(ptr, len); debug_precondition_logaligned!(A, ptr); - // SAFETY: the caller must uphold the safety contract of - // `internals::volatile_memset` - unsafe { - internals::volatile_memset(ptr, 0, len); - } - fence(); - } -} - -/// This zeroizer uses volatile zeroization functions provided by libc. -/// It should be fast but is only available on certain platforms. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. 
-#[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" -))] -#[derive(Debug, Copy, Clone, Default)] -pub struct LibcZeroizer; - -#[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" -))] -impl MemZeroizer for LibcZeroizer { - unsafe fn zeroize_mem_blocks(&self, ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - debug_precondition_logaligned!(A, ptr); - debug_precondition_logmultiple!(B, len); - // SAFETY: the caller must uphold the safety contract of - // `internals::libc_explicit_bzero` - unsafe { - internals::libc_explicit_bzero(ptr, len); - } - fence(); - } -} - -/// This zeroizer uses volatile assembly (`rep stosb`) for modern x86_64, -/// performing very well for large amounts of memory. To make this available on -/// stable, it uses a C compiler at build time. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. -#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] -#[derive(Debug, Copy, Clone, Default)] -pub struct AsmRepStosZeroizer; - -#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] -impl MemZeroizer for AsmRepStosZeroizer { - unsafe fn zeroize_mem_blocks(&self, ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - debug_precondition_logaligned!(A, ptr); - debug_precondition_logmultiple!(B, len); - // SAFETY: the caller must uphold the safety contract of - // `internals::asm_ermsb_zeroize` + // SAFETY: the caller must uphold the safety contract unsafe { - internals::asm_ermsb_zeroize(ptr, len); + core::intrinsics::volatile_set_memory(ptr, 0, len); } - fence(); } } -/// This zeroizer uses a volatile write per byte. This zeroization technique is -/// similar to the `zeroize` crate, available for all target platforms on -/// stable, but extremely slow. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. +/// This zeroizer uses a non-volatile memset, followed by an empty asm block +/// acting as an optimisation barier. It should be very fast, and according to +/// my current understanding of the op.sem. the compiler is not allowed to +/// remove the writes. 
#[derive(Debug, Copy, Clone, Default)] -pub struct VolatileWriteZeroizer; +pub struct MemsetAsmBarierZeroizer; -impl MemZeroizer for VolatileWriteZeroizer { +impl MemZeroizer for MemsetAsmBarierZeroizer { unsafe fn zeroize_mem_blocks(&self, ptr: *mut u8, len: usize) { precondition_memory_range!(ptr, len); debug_precondition_logaligned!(A, ptr); - debug_precondition_logmultiple!(B, len); - // SAFETY: the caller must uphold the safety contract of - // `volatile_write_zeroize_mem` + // SAFETY: the caller must uphold the safety contract of `write_bytes` + unsafe { ptr.write_bytes(0, len) }; + // Optimisation barier, so the writes can not be optimised out unsafe { - internals::volatile_write_zeroize(ptr, len); - } - fence(); + core::arch::asm!( + "/* {0} */", + in(reg) ptr, + options(nostack, readonly, preserves_flags), + ) + }; } } /// This zeroizer uses a volatile write per 8 bytes if the pointer is 8 byte /// aligned, and otherwise uses `VolatileWriteZeroizer`. This zeroization -/// technique is available for all target platforms on stable, but not very -/// fast. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. +/// technique is pure Rust and available for all target platforms on stable, but +/// not very fast. /// /// This zeroization method can benefit (in terms of performance) from using the /// [`MemZeroizer::zeroize_mem_blocks`] function instead of @@ -279,132 +156,8 @@ impl MemZeroizer for VolatileWrite8Zeroizer { internals::volatile_write_zeroize(ptr, len); } } - fence(); - } -} - -/// This zeroizer uses inline asm with avx2 instructions if the pointer is 32 -/// byte aligned, and otherwise uses `VolatileWrite8Zeroizer`. This zeroization -/// technique is available for x86_64 platforms with avx2 cpu support on stable, -/// and reasonably fast for 32 byte aligned pointers. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. -/// -/// This zeroization method can benefit (in terms of performance) from using the -/// [`MemZeroizer::zeroize_mem_blocks`] function instead of -/// [`MemZeroizer::zeroize_mem`] function if a minimum alignment is known -/// at compile time. 
-#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -#[derive(Debug, Copy, Clone, Default)] -pub struct X86_64AvxZeroizer; - -#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -impl MemZeroizer for X86_64AvxZeroizer { - unsafe fn zeroize_mem_blocks(&self, mut ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - debug_precondition_logaligned!(A, ptr); - debug_precondition_logmultiple!(B, len); - // if we have 32 = 2^5 byte alignment then write 32 bytes at a time, - // with 8 = 2^3 byte align do 8 bytes at a time, otherwise 1 byte at a time - if (A >= 5) | is_aligned_ptr_mut(ptr, 32) { - // SAFETY: `ptr` is 32 byte aligned - ptr = unsafe { internals::x86_64_simd32_unroll2_zeroize_align32_block32(ptr, len) }; - // zeroize tail - if B < 5 { - ptr = unsafe { internals::zeroize_align8_block8(ptr, len % 32) }; - } - if B < 3 { - unsafe { internals::zeroize_align4_tail8(ptr, len % 8) }; - } - } else if (A >= 3) | is_aligned_ptr_mut(ptr, 8) { - // SAFETY: `ptr` is 8 byte aligned - ptr = unsafe { internals::zeroize_align8_block8(ptr, len) }; - if B < 3 { - unsafe { internals::zeroize_align4_tail8(ptr, len % 8) }; - } - } else { - // SAFETY: no alignment requirement - unsafe { - internals::volatile_write_zeroize(ptr, len); - } - } - fence(); - } -} - -/// This zeroizer uses inline asm with sse2 instructions if the pointer is 16 -/// byte aligned, and otherwise uses `VolatileWrite8Zeroizer`. This zeroization -/// technique is available for x86_64 platforms with sse2 cpu support on stable, -/// and reasonably fast for 16 byte aligned pointers. -/// -/// In addition to the volatile write we place a compiler fence right next to -/// the volatile write. This should not be necessary for secure zeroization -/// since the volatile semantics guarenties our writes are not elided, and they -/// can not be delayed since we are deallocating the memory after zeroization. -/// The use of this fence is therefore only a precaution. -/// -/// This zeroization method can benefit (in terms of performance) from using the -/// [`MemZeroizer::zeroize_mem_blocks`] function instead of -/// [`MemZeroizer::zeroize_mem`] function if a minimum alignment is known -/// at compile time. -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -#[derive(Debug, Copy, Clone, Default)] -pub struct X86_64Sse2Zeroizer; - -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -impl MemZeroizer for X86_64Sse2Zeroizer { - unsafe fn zeroize_mem_blocks(&self, mut ptr: *mut u8, len: usize) { - precondition_memory_range!(ptr, len); - debug_precondition_logaligned!(A, ptr); - debug_precondition_logmultiple!(B, len); - // if we have 16 = 2^4 byte alignment then write 16 bytes at a time, - // with 8 = 2^3 byte align do 8 bytes at a time, otherwise 1 byte at a time - if (A >= 4) | is_aligned_ptr_mut(ptr, 16) { - // SAFETY: `ptr` is 16 byte aligned - - ptr = unsafe { internals::x86_64_simd16_unroll2_zeroize_align16_block16(ptr, len) }; - // zeroize tail - if B < 4 { - ptr = unsafe { internals::zeroize_align8_block8(ptr, len % 16) }; - } - if B < 3 { - unsafe { internals::zeroize_align4_tail8(ptr, len % 8) }; - } - } else if (A >= 3) | is_aligned_ptr_mut(ptr, 8) { - // SAFETY: `ptr` is 8 byte aligned - ptr = unsafe { internals::zeroize_align8_block8(ptr, len) }; - if B < 3 { - unsafe { internals::zeroize_align4_tail8(ptr, len % 8) }; - } - } else { - // SAFETY: no alignment requirement - unsafe { - internals::volatile_write_zeroize(ptr, len); - } - } - fence(); } } -/// Compiler fence. 
-/// -/// Forces sequentially consistent access across this fence at compile time. At -/// runtime the CPU can still reorder memory accesses. This should not be -/// necessary for secure zeroization since the volatile semantics guaranties our -/// writes are not elided, and they can not be delayed since we are deallocating -/// the memory after zeroization. The use of this fence is therefore only a -/// precaution. For the same reasons it probably does not add security, it also -/// probably does not hurt performance significantly. -#[inline] -fn fence() { - use core::sync::atomic::{compiler_fence, Ordering}; - - compiler_fence(Ordering::SeqCst); -} - #[cfg(test)] mod tests; diff --git a/src/zeroize/tests.rs b/src/zeroize/tests.rs index 8571cc0..ed0ba44 100644 --- a/src/zeroize/tests.rs +++ b/src/zeroize/tests.rs @@ -33,98 +33,18 @@ fn test_b127_volatile_memset_zeroizer() { test_b127_zeroizer(VolatileMemsetZeroizer); } -#[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" -))] -#[test] -#[cfg_attr(miri, ignore)] // ffi -fn test_b127_libc_zeroizer() { - test_b127_zeroizer(LibcZeroizer); -} - -#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] -#[test] -#[cfg_attr(miri, ignore)] // ffi, asm -fn test_b127_asm_rep_stos_zeroizer() { - test_b127_zeroizer(AsmRepStosZeroizer); -} - -#[test] -fn test_b127_volatile_write_zeroizer() { - test_b127_zeroizer(VolatileWriteZeroizer); -} - #[test] fn test_b127_volatile_write8_zeroizer() { test_b127_zeroizer(VolatileWrite8Zeroizer); } -#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -#[test] -fn test_b127_x86_64_avx_zeroizer() { - test_b127_zeroizer(X86_64AvxZeroizer); -} - -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -#[test] -fn test_b127_x86_64_sse2_zeroizer() { - test_b127_zeroizer(X86_64Sse2Zeroizer); -} - #[cfg(feature = "nightly_core_intrinsics")] #[test] fn test_b239_lowalign_volatile_memset_zeroizer() { test_b239_lowalign_zeroizer(VolatileMemsetZeroizer); } -#[cfg(any( - target_os = "freebsd", - target_os = "dragonfly", - target_os = "openbsd", - target_os = "netbsd", - target_os = "macos", - target_os = "ios", - target_env = "gnu", - target_env = "musl" -))] -#[test] -#[cfg_attr(miri, ignore)] // ffi -fn test_b239_lowalign_libc_zeroizer() { - test_b239_lowalign_zeroizer(LibcZeroizer); -} - -#[cfg(all(target_arch = "x86_64", target_feature = "ermsb"))] -#[test] -#[cfg_attr(miri, ignore)] // ffi, asm -fn test_b239_lowalign_asm_rep_stos_zeroizer() { - test_b239_lowalign_zeroizer(AsmRepStosZeroizer); -} - -#[test] -fn test_b239_lowalign_volatile_write_zeroizer() { - test_b239_lowalign_zeroizer(VolatileWriteZeroizer); -} - #[test] fn test_b239_lowalign_volatile_write8_zeroizer() { test_b239_lowalign_zeroizer(VolatileWrite8Zeroizer); } - -#[cfg(all(target_arch = "x86_64", target_feature = "avx"))] -#[test] -fn test_b239_lowalign_x86_64_avx_zeroizer() { - test_b239_lowalign_zeroizer(X86_64AvxZeroizer); -} - -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -#[test] -fn test_b239_lowalign_x86_64_sse2_zeroizer() { - test_b239_lowalign_zeroizer(X86_64Sse2Zeroizer); -}
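
For reference, the technique introduced by this patch can be reproduced in
isolation with the sketch below. It uses the same ingredients as
`MemsetAsmBarierZeroizer` above: a plain (non-volatile) `write_bytes`,
followed by an empty `asm!` block that takes the pointer as an input operand,
so the compiler has to assume the asm may read the buffer and (to my current
understanding of the operational semantics) cannot elide the preceding
writes. The free function `zeroize_with_asm_barrier` and the `main` driver
are illustrative only and are not part of the crate's API.

    /// Illustrative stand-alone version of the barrier-based zeroization.
    ///
    /// # Safety
    /// `ptr` must be valid for writes of `len` bytes.
    unsafe fn zeroize_with_asm_barrier(ptr: *mut u8, len: usize) {
        // Plain, non-volatile memset; on its own the optimiser may remove it
        // when it can prove the memory is never read afterwards.
        unsafe { ptr.write_bytes(0, len) };
        // Empty asm block with `ptr` as input: the compiler must assume the
        // asm reads the pointed-to memory, so the writes above cannot be
        // optimised out.
        unsafe {
            core::arch::asm!(
                "/* {0} */",
                in(reg) ptr,
                options(nostack, readonly, preserves_flags),
            );
        }
    }

    fn main() {
        let mut secret = [0x42u8; 32];
        // SAFETY: `secret` is valid for writes of `secret.len()` bytes.
        unsafe { zeroize_with_asm_barrier(secret.as_mut_ptr(), secret.len()) };
        assert!(secret.iter().all(|&b| b == 0));
    }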