From 87f9976f2a943ff9a7cc0e122324d47027a71514 Mon Sep 17 00:00:00 2001
From: Scott Lamb
Date: Thu, 20 Jun 2024 18:21:34 -0700
Subject: [PATCH] support funny sizes

* refine `PlaneDims` logic for uyvy dimension calculation
* relax constraints on `uyvy_to_i420::convert`
* process rows, not just blocks, with processors deferring to a fallback.
  In theory at least the AVX one could use masked instructions instead,
  but the fallback approach was easy.
* test with miri
---
 .github/workflows/push.yml |  16 ++
 README.md                  |  10 +-
 src/frame.rs               |  57 ++--
 src/lib.rs                 |  69 ++++-
 src/uyvy_to_i420.rs        | 517 ++++++++++++++++++++++++++-----------
 5 files changed, 477 insertions(+), 192 deletions(-)

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 01cd156..ff74128 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -12,6 +12,7 @@ jobs:
     runs-on: ${{ matrix.runner }}
     strategy:
+      fail-fast: false
       matrix:
         runner: [ubuntu-latest, macos-latest]
     steps:
@@ -31,3 +32,18 @@ jobs:
     - run: cargo clippy -- --deny warnings
     - run: cargo test --release
     - run: cargo criterion
+
+  miri:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Setup Rust
+      uses: actions-rust-lang/setup-rust-toolchain@v1
+      with:
+        toolchain: nightly
+        components: miri
+    - name: Cache
+      uses: Swatinem/rust-cache@v2
+      with:
+        key: miri
+    - run: cargo miri test
\ No newline at end of file
diff --git a/README.md b/README.md
index e78f148..e485678 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,10 @@ Limitations and future work:
 * Returns `Err` on x86\_64 CPUs that don't support AVX2. We'll likely add
   an SSE2 fallback later. As SSE2 is in the core x86\_64 instruction set,
   this would mean all x86\_64 CPUs would be supported.
-* Returns `Err` for frame widths that aren't a multiple
-  of 64 pixels (for AVX2) or 32 pixels (for NEON). This could be eased via a
-  scalar fallback path for the remaining pixels, or (for AVX2) masked
-  load/store instructions.
-* Returns `Err` for frame heights that aren't a multiple of 2.
 * Expects to process full horizontal lines. This is likely to
-  change to allow working on cropped regions and outputting to frames with
-  extra padding between lines as required by some APIs/devices.
+  change to allow working on cropped regions.
+* Does not support output to a frame with padding, as required by some
+  APIs/devices.
 * The ARM NEON code is less optimized than the AVX2 code today.
 
 You may find the notes in [`docs/simd.md`](docs/simd.md) helpful if you are new
diff --git a/src/frame.rs b/src/frame.rs
index 69a3d51..ad36b15 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -52,14 +52,6 @@ pub unsafe trait Frame {
     /// Returns true if this frame has been fully initialized.
     fn initialized(&self) -> bool;
 
-    /// Marks this frame as fully initialized.
-    ///
-    /// # Safety
-    ///
-    /// The caller must ensure that the frame is fully initialized, including
-    /// any padding bytes.
-    unsafe fn initialize(&mut self);
-
     /// Returns the (image format-defined) planes for read/shared access.
     fn planes(&self) -> ArrayVec;
 }
@@ -74,6 +66,14 @@ pub unsafe trait Frame {
 pub unsafe trait FrameMut: Frame {
     /// Returns the (image format-defined) planes for mutation/exclusive access.
     fn planes_mut(&mut self) -> ArrayVec;
+
+    /// Marks this frame as fully initialized.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that the frame is fully initialized, including
+    /// any padding bytes.
+    unsafe fn initialize(&mut self);
 }
 
 /// Provides read-only access to a given image plane.
@@ -247,6 +247,16 @@ pub unsafe trait Storage {
 
     /// Returns a raw pointer to the start of the storage.
     fn as_ptr(&self) -> *const u8;
+}
+
+/// Write access to a backing buffer for a [`ConsecutiveFrame`].
+///
+/// # Safety
+///
+/// As in [`Storage`].
+pub unsafe trait StorageMut: Storage {
+    /// Returns a raw pointer to the start of the storage.
+    fn as_mut_ptr(&mut self) -> *mut u8;
 
     /// Notes that this storage is initialized, up to length `len`.
     ///
@@ -262,16 +272,6 @@
     unsafe fn initialize(&mut self, len: usize) {}
 }
 
-/// Write access to a backing buffer for a [`ConsecutiveFrame`].
-///
-/// # Safety
-///
-/// As in [`Storage`].
-pub unsafe trait StorageMut: Storage {
-    /// Returns a raw pointer to the start of the storage.
-    fn as_mut_ptr(&mut self) -> *mut u8;
-}
-
 unsafe impl Storage for Vec<u8> {
     #[inline]
     fn check_len(&self, len: usize) -> bool {
@@ -289,6 +289,11 @@ unsafe impl StorageMut for Vec<u8> {
     fn as_mut_ptr(&mut self) -> *mut u8 {
         self.as_mut_ptr()
     }
+
+    #[inline]
+    unsafe fn initialize(&mut self, len: usize) {
+        unsafe { self.set_len(len) };
+    }
 }
 
 macro_rules! impl_slice_storage {
@@ -478,14 +483,6 @@ unsafe impl<S: Storage> Frame for ConsecutiveFrame<S> {
         }
         planes
     }
-
-    #[inline]
-    unsafe fn initialize(&mut self) {
-        if !self.initialized {
-            self.storage.initialize(self.total_size());
-            self.initialized = true;
-        }
-    }
 }
 
 unsafe impl<S: StorageMut> FrameMut for ConsecutiveFrame<S> {
@@ -506,4 +503,12 @@ unsafe impl<S: StorageMut> FrameMut for ConsecutiveFrame<S> {
         }
         planes
     }
+
+    #[inline]
+    unsafe fn initialize(&mut self) {
+        if !self.initialized {
+            self.storage.initialize(self.total_size());
+            self.initialized = true;
+        }
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 303b866..320d086 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -40,11 +40,17 @@ pub enum PixelFormat {
     /// [UYVY](https://fourcc.org/pixel-format/yuv-uyvy/).
     ///
     /// Matches ffmpeg's `AV_PIX_FMT_UYVY422`: "packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1".
+    ///
+    /// For odd-width images, the width is rounded up to the next multiple of 2,
+    /// with the final `Y` as a don't-care byte, and the final chroma values not
+    /// subsampled.
     UYVY422,
 
     /// [I420](https://fourcc.org/pixel-format/yuv-i420/).
     ///
     /// Matches ffmpeg's `AV_PIX_FMT_YUV420P`: "planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)".
+    ///
+    /// For odd-width and odd-height images, the final pixel is not subsampled.
     I420,
 
     /// BGRA.
@@ -81,7 +87,11 @@ impl PixelFormat {
         match self {
             PixelFormat::UYVY422 => {
                 sizes.push(PlaneDims {
-                    stride: width.checked_shl(1).expect("stride should not overflow"),
+                    // Round to next multiple of 2, then double.
+                    stride: width
+                        .checked_add(width & 1)
+                        .and_then(|w| w.checked_shl(1))
+                        .expect("stride should not overflow"),
                     rows: height,
                 });
             }
@@ -91,8 +101,8 @@ impl PixelFormat {
                 sizes.push(PlaneDims {
                     stride: width,
                     rows: height,
                 });
+                // U/V planes.
                 let chroma_plane_size = PlaneDims {
-                    // U/V planes.
                     // Overflow-safe divide by two that rounds up.
                    stride: (width >> 1) + (width & 1),
                    rows: (height >> 1) + (height & 1),
@@ -120,3 +130,58 @@ impl PixelFormat {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn odd_sizes() {
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(1, 1)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 4, rows: 1 }]
+        );
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(2, 2)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 4, rows: 2 }]
+        );
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(3, 3)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 8, rows: 3 }]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(1, 1)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 }
+            ]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(2, 2)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 2, rows: 2 },
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 }
+            ]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(3, 3)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 3, rows: 3 },
+                super::PlaneDims { stride: 2, rows: 2 },
+                super::PlaneDims { stride: 2, rows: 2 }
+            ]
+        );
+    }
+}
diff --git a/src/uyvy_to_i420.rs b/src/uyvy_to_i420.rs
index 91aefcd..e203102 100644
--- a/src/uyvy_to_i420.rs
+++ b/src/uyvy_to_i420.rs
@@ -18,25 +18,25 @@ use crate::{
     ConversionError,
 };
 
-/// Processes a block `PIXELS` columns wide, 2 rows high.
+/// Processes a block of 2 rows.
 #[doc(hidden)]
-pub trait BlockProcessor: Copy + Clone + Sized + Send + Sync {
-    /// The width of this block in pixels.
-    const PIXELS: usize;
-
+pub trait RowProcessor: Copy + Clone + Sized + Send + Sync {
     /// Returns an instance if this processor is supported on this machine.
     fn new() -> Result<Self, ConversionError>;
 
-    /// Processes a block `PIXELS` wide, two rows high.
+    /// Processes a block `width` pixels wide, two rows high.
     ///
     /// # Safety
    ///
     /// Caller must ensure the following:
-    /// * `top_uyvy_addr` and `bot_uyvy_addr` each contain `2 * PIXELS` bytes of initialized data.
-    /// * `top_y_addr` and `bot_y_addr` are each valid destinations for `PIXELS` bytes.
-    /// * `u_addr` and `v_addr` are each valid destinations for `PIXELS / 2` bytes.
+    /// * `top_uyvy_addr` and `bot_uyvy_addr` each contain `2 * width` bytes of initialized data.
+    /// * `top_y_addr` and `bot_y_addr` are each valid destinations for `width` bytes.
+    ///   They may however alias each other, which would not be allowed with `&mut`.
+    /// * `u_addr` and `v_addr` are each valid destinations for `(width + 1) / 2` bytes.
+    #[allow(clippy::too_many_arguments)]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -64,7 +64,7 @@ pub fn convert<FI: Frame, FO: FrameMut>(
 }
 
 #[doc(hidden)]
-pub fn convert_with<P: BlockProcessor, FI: Frame, FO: FrameMut>(
+pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
     uyvy_in: &FI,
     yuv_out: &mut FO,
 ) -> Result<(), ConversionError> {
@@ -76,10 +76,6 @@
     {
         return Err(ConversionError("invalid arguments"));
     }
-    if width % P::PIXELS != 0 || height % 2 != 0 {
-        // TODO: support irregular sizes.
-        return Err(ConversionError("irregular sizes unsupported"));
-    }
     let p = P::new()?;
     let pixels = width * height;
     let uyvy_planes = uyvy_in.planes();
@@ -90,31 +86,53 @@
     let [y_out, u_out, v_out] = &mut yuv_planes[..] else {
         panic!("yuv_out must have three planes");
     };
-    if y_out.stride() != width || u_out.stride() != width / 2 || v_out.stride() != width / 2 {
-        // TODO: support padding.
-        return Err(ConversionError("padding unsupported"));
+    let chroma_width = (width >> 1) + (width & 1);
+    let chroma_rows = (height >> 1) + (height & 1);
+    let chroma_size = chroma_width * chroma_rows;
+    if y_out.stride() != width || u_out.stride() != chroma_width || v_out.stride() != chroma_width {
+        return Err(ConversionError("output padding unsupported")); // TODO
     }
-    assert_eq!(uyvy_in.len(), pixels * 2);
+    let uyvy_stride = uyvy_in.stride();
+    assert!(uyvy_stride >= width * 2);
+    assert!(uyvy_in.len() >= height * uyvy_stride - (uyvy_stride - width * 2));
     let uyvy_in = uyvy_in.as_ptr();
     assert_eq!(y_out.len(), pixels);
-    assert_eq!(u_out.len(), pixels / 4);
-    assert_eq!(v_out.len(), pixels / 4);
-    let y_out = y_out.as_mut_ptr().cast::<u8>();
-    let uyvy_row_stride = 2 * width; // TODO: support line padding?
-    let u_out = u_out.as_mut_ptr().cast::<u8>();
-    let v_out = v_out.as_mut_ptr().cast::<u8>();
-    for r in (0..height).step_by(2) {
-        for c in (0..width).step_by(P::PIXELS) {
-            unsafe {
-                p.process(
-                    uyvy_in.add(r * uyvy_row_stride + 2 * c),
-                    uyvy_in.add((r + 1) * uyvy_row_stride + 2 * c),
-                    y_out.add(r * width + c),
-                    y_out.add((r + 1) * width + c),
-                    u_out.add((r >> 1) * (width >> 1) + (c >> 1)),
-                    v_out.add((r >> 1) * (width >> 1) + (c >> 1)),
-                );
-            }
+    assert_eq!(u_out.len(), chroma_size);
+    assert_eq!(v_out.len(), chroma_size);
+    let y_out = y_out.as_mut_ptr();
+    let u_out = u_out.as_mut_ptr();
+    let v_out = v_out.as_mut_ptr();
+    let mut r = 0;
+    loop {
+        if r + 2 > height {
+            break;
+        }
+        unsafe {
+            p.process(
+                width,
+                uyvy_in.add(r * uyvy_stride),
+                uyvy_in.add((r + 1) * uyvy_stride),
+                y_out.add(r * width),
+                y_out.add((r + 1) * width),
+                u_out.add((r >> 1) * chroma_width),
+                v_out.add((r >> 1) * chroma_width),
+            );
+        }
+        r += 2;
+    }
+    if r < height {
+        // Process the last row without subsampling vertically.
+        unsafe {
+            // XXX: use p.process.
+            fallback(
+                width,
+                uyvy_in.add(r * uyvy_stride),
+                uyvy_in.add(r * uyvy_stride), // aliased!
+                y_out.add(r * width),
+                y_out.add(r * width), // aliased!
+                u_out.add((r >> 1) * chroma_width),
+                v_out.add((r >> 1) * chroma_width),
+            );
         }
     }
     drop(yuv_planes);
@@ -136,15 +154,44 @@ unsafe fn hexprint(v: std::arch::x86_64::__m256i) -> impl std::fmt::Display {
     )
 }
 
+/// Scalar processing of 2 rows of `width` pixels, averaging chroma with rounding.
+/// SIMD processors defer to this for tails narrower than their block size, and
+/// `convert_with` calls it with aliased rows for the final row of odd-height frames.
+unsafe fn fallback(
+    width: usize,
+    top_uyvy_addr: *const u8,
+    bot_uyvy_addr: *const u8,
+    top_y_addr: *mut u8,
+    bot_y_addr: *mut u8,
+    u_addr: *mut u8,
+    v_addr: *mut u8,
+) {
+    for i in 0..width {
+        std::ptr::write(
+            top_y_addr.add(i),
+            std::ptr::read(top_uyvy_addr.add(2 * i + 1)),
+        );
+        std::ptr::write(
+            bot_y_addr.add(i),
+            std::ptr::read(bot_uyvy_addr.add(2 * i + 1)),
+        );
+    }
+    let avg = |a: u8, b: u8| ((u16::from(a) + u16::from(b) + 1) >> 1) as u8;
+    let chroma_width = (width >> 1) + (width & 1);
+    for i in 0..chroma_width {
+        let top_u = std::ptr::read(top_uyvy_addr.add(4 * i));
+        let bot_u = std::ptr::read(bot_uyvy_addr.add(4 * i));
+        let top_v = std::ptr::read(top_uyvy_addr.add(4 * i + 2));
+        let bot_v = std::ptr::read(bot_uyvy_addr.add(4 * i + 2));
+        std::ptr::write(u_addr.add(i), avg(top_u, bot_u));
+        std::ptr::write(v_addr.add(i), avg(top_v, bot_v));
+    }
+}
+
 #[cfg(target_arch = "x86_64")]
 #[doc(hidden)]
 #[derive(Copy, Clone)]
 pub struct ExplicitAvx2DoubleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl BlockProcessor for ExplicitAvx2DoubleBlock {
-    const PIXELS: usize = 64;
-
+impl RowProcessor for ExplicitAvx2DoubleBlock {
     #[inline]
     fn new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
@@ -158,6 +205,7 @@
     #[inline(never)]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -171,41 +219,68 @@
             1, 3, 5, 7, 9, 11, 13, 15, // lower half: 8 Y components.
             0, 4, 8, 12, 2, 6, 10, 14, // upper half: (4 * U), (4 * V).
         ));
-        let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 4] {
-            std::array::from_fn(|i| {
-                // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
-                let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
-                // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
-                x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
-            })
-        });
-        for (data, addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
-            for i in [0, 1] {
-                // Put into 64-groups: y0 y2 y1 y3.
-                // VPUNPCKLQDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
-                let swapped = x86_64::_mm256_unpacklo_epi64(data[2 * i], data[2 * i + 1]);
-
-                // Swap y2 and y1 to produce: y0 y1 y2 y3.
-                // VPERMQ (YMM, YMM, I8) on Zen2: lat 6, cpi 1.27
-                x86_64::_mm256_storeu_si256(
-                    addr.add(32 * i) as _,
-                    x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(swapped),
-                );
+
+        // Process the nice blocks.
+        const BLOCK_SIZE: usize = 64;
+        let mut i = 0;
+        loop {
+            let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+            let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+            let top_y_addr = top_y_addr.add(i);
+            let bot_y_addr = bot_y_addr.add(i);
+            let u_addr = u_addr.add(i / 2);
+            let v_addr = v_addr.add(i / 2);
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 4] {
+                std::array::from_fn(|i| {
+                    // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
+                    let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
+                    // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
+                    x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
+                })
+            });
+            for (data, addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
+                for i in [0, 1] {
+                    // Put into 64-groups: y0 y2 y1 y3.
+                    // VPUNPCKLQDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
+                    let swapped = x86_64::_mm256_unpacklo_epi64(data[2 * i], data[2 * i + 1]);
+
+                    // Swap y2 and y1 to produce: y0 y1 y2 y3.
+                    // VPERMQ (YMM, YMM, I8) on Zen2: lat 6, cpi 1.27
+                    x86_64::_mm256_storeu_si256(
+                        addr.add(32 * i) as _,
+                        x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(swapped),
+                    );
+                }
             }
+            let uv: [_; 2] = std::array::from_fn(|i| {
+                // unpackhi_epi32(data[0], data[1]) returns (u0 u2 v0 v2) (u1 u3 v1 v3).
+                // VPUNPCKHDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
+                x86_64::_mm256_avg_epu8(
+                    x86_64::_mm256_unpackhi_epi32(top[2 * i], top[2 * i + 1]),
+                    x86_64::_mm256_unpackhi_epi32(bot[2 * i], bot[2 * i + 1]),
+                )
+            });
+            let mix = x86_64::_mm256_permute2x128_si256::<0b_00_11_00_01>(uv[0], uv[1]);
+            let uv0prime =
+                x86_64::_mm256_inserti128_si256::<1>(uv[0], x86_64::_mm256_castsi256_si128(uv[1]));
+            x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
+            x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
+            );
+        }
-        let uv: [_; 2] = std::array::from_fn(|i| {
-            // unpackhi_epi32(data[0], data[1]) returns (u0 u2 v0 v2) (u1 u3 v1 v3).
-            // VPUNPCKHDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
-            x86_64::_mm256_avg_epu8(
-                x86_64::_mm256_unpackhi_epi32(top[2 * i], top[2 * i + 1]),
-                x86_64::_mm256_unpackhi_epi32(bot[2 * i], bot[2 * i + 1]),
-            )
-        });
-        let mix = x86_64::_mm256_permute2x128_si256::<0b_00_11_00_01>(uv[0], uv[1]);
-        let uv0prime =
-            x86_64::_mm256_inserti128_si256::<1>(uv[0], x86_64::_mm256_castsi256_si128(uv[1]));
-        x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
-        x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
     }
 }
 
@@ -215,9 +290,7 @@
 pub struct ExplicitAvx2SingleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl BlockProcessor for ExplicitAvx2SingleBlock {
-    const PIXELS: usize = 32;
-
+impl RowProcessor for ExplicitAvx2SingleBlock {
     #[inline]
     fn new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
@@ -231,6 +304,7 @@
     #[target_feature(enable = "avx2")]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -244,33 +318,59 @@
             1, 3, 5, 7, 9, 11, 13, 15, // lower half: 8 Y components.
             0, 4, 8, 12, 2, 6, 10, 14, // upper half: (4 * U), (4 * V).
         ));
-        let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 2] {
-            std::array::from_fn(|i| {
-                // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
-                let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
-                // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
-                x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
-            })
-        });
-        for (data, y_addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
-            let y = x86_64::_mm256_unpacklo_epi64(data[0], data[1]);
-            // VMOVDQU (M256, YMM) on Zen2: ports 1*FP2.
-            x86_64::_mm256_storeu_si256(
-                y_addr as _,
-                x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(y),
+        // Process the nice blocks.
+        const BLOCK_SIZE: usize = 32;
+        let mut i = 0;
+        loop {
+            let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+            let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+            let top_y_addr = top_y_addr.add(i);
+            let bot_y_addr = bot_y_addr.add(i);
+            let u_addr = u_addr.add(i / 2);
+            let v_addr = v_addr.add(i / 2);
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 2] {
+                std::array::from_fn(|i| {
+                    // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
+                    let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
+                    // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
+                    x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
+                })
+            });
+            for (data, y_addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
+                let y = x86_64::_mm256_unpacklo_epi64(data[0], data[1]);
+                // VMOVDQU (M256, YMM) on Zen2: ports 1*FP2.
+                x86_64::_mm256_storeu_si256(
+                    y_addr as _,
+                    x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(y),
+                );
+            }
+
+            let uv = x86_64::_mm256_avg_epu8(
+                x86_64::_mm256_unpackhi_epi32(top[0], top[1]),
+                x86_64::_mm256_unpackhi_epi32(bot[0], bot[1]),
+            );
+            let p = x86_64::_mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+            x86_64::_mm256_storeu2_m128i(
+                v_addr as _,
+                u_addr as _,
+                x86_64::_mm256_permutevar8x32_epi32(uv, p),
+            );
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
             );
         }
-
-        let uv = x86_64::_mm256_avg_epu8(
-            x86_64::_mm256_unpackhi_epi32(top[0], top[1]),
-            x86_64::_mm256_unpackhi_epi32(bot[0], bot[1]),
-        );
-        let p = x86_64::_mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
-        x86_64::_mm256_storeu2_m128i(
-            v_addr as _,
-            u_addr as _,
-            x86_64::_mm256_permutevar8x32_epi32(uv, p),
-        );
     }
 }
 
@@ -280,9 +380,7 @@
 pub struct ExplicitNeon(());
 
 #[cfg(target_arch = "aarch64")]
-impl BlockProcessor for ExplicitNeon {
-    const PIXELS: usize = 32;
-
+impl RowProcessor for ExplicitNeon {
     fn new() -> Result<Self, ConversionError> {
         if std::arch::is_aarch64_feature_detected!("neon") {
             Ok(Self(()))
@@ -295,6 +393,7 @@
     #[target_feature(enable = "neon")]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -302,12 +401,43 @@
         u_addr: *mut u8,
         v_addr: *mut u8,
     ) {
-        let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr);
-        let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr);
-        aarch64::vst2q_u8(top_y_addr, aarch64::uint8x16x2_t(top_uyvy.1, top_uyvy.3));
-        aarch64::vst2q_u8(bot_y_addr, aarch64::uint8x16x2_t(bot_uyvy.1, bot_uyvy.3));
-        aarch64::vst1q_u8(u_addr, aarch64::vrhaddq_u8(top_uyvy.0, bot_uyvy.0));
-        aarch64::vst1q_u8(v_addr, aarch64::vrhaddq_u8(top_uyvy.2, bot_uyvy.2));
+        const BLOCK_SIZE: usize = 32;
+        let mut i = 0;
+        loop {
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr.add(2 * i));
+            let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr.add(2 * i));
+            aarch64::vst2q_u8(
+                top_y_addr.add(i),
+                aarch64::uint8x16x2_t(top_uyvy.1, top_uyvy.3),
+            );
+            aarch64::vst2q_u8(
+                bot_y_addr.add(i),
+                aarch64::uint8x16x2_t(bot_uyvy.1, bot_uyvy.3),
+            );
+            aarch64::vst1q_u8(
+                u_addr.add(i / 2),
+                aarch64::vrhaddq_u8(top_uyvy.0, bot_uyvy.0),
+            );
+            aarch64::vst1q_u8(
+                v_addr.add(i / 2),
+                aarch64::vrhaddq_u8(top_uyvy.2, bot_uyvy.2),
+            );
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
+            );
+        }
     }
 }
 
@@ -322,9 +452,7 @@ macro_rules! auto {
         #[derive(Copy, Clone)]
         pub struct $ident<const PIXELS: usize>([(); PIXELS]);
 
-        impl<const PIXELS: usize> BlockProcessor for $ident<PIXELS> {
-            const PIXELS: usize = PIXELS;
-
+        impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
            #[inline(always)]
            fn new() -> Result<Self, ConversionError> {
                if true && $($supported)+ {
@@ -338,6 +466,7 @@
             $(#[target_feature(enable = $feature)])*
             unsafe fn process(
                 self,
+                width: usize,
                 top_uyvy_addr: *const u8,
                 bot_uyvy_addr: *const u8,
                 top_y_addr: *mut u8,
@@ -345,18 +474,45 @@
                 u_addr: *mut u8,
                 v_addr: *mut u8,
             ) {
-                for i in 0..PIXELS {
-                    std::ptr::write(top_y_addr.add(i), std::ptr::read(top_uyvy_addr.add(2*i + 1)));
-                    std::ptr::write(bot_y_addr.add(i), std::ptr::read(bot_uyvy_addr.add(2*i + 1)));
+                let mut i = 0;
+
+                // The u/v logic below doesn't handle an odd number of pixels per block.
+                const { assert!(PIXELS % 2 == 0); }
+                loop {
+                    if i + PIXELS > width {
+                        break;
+                    }
+                    let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+                    let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+                    let top_y_addr = top_y_addr.add(i);
+                    let bot_y_addr = bot_y_addr.add(i);
+                    let u_addr = u_addr.add(i / 2);
+                    let v_addr = v_addr.add(i / 2);
+                    for j in 0..PIXELS {
+                        std::ptr::write(top_y_addr.add(j), std::ptr::read(top_uyvy_addr.add(2*j + 1)));
+                        std::ptr::write(bot_y_addr.add(j), std::ptr::read(bot_uyvy_addr.add(2*j + 1)));
+                    }
+                    let avg = |a: u8, b: u8| { ((u16::from(a) + u16::from(b) + 1) >> 1) as u8 };
+                    for j in 0..PIXELS/2 {
+                        let top_u = std::ptr::read(top_uyvy_addr.add(4*j));
+                        let bot_u = std::ptr::read(bot_uyvy_addr.add(4*j));
+                        let top_v = std::ptr::read(top_uyvy_addr.add(4*j + 2));
+                        let bot_v = std::ptr::read(bot_uyvy_addr.add(4*j + 2));
+                        std::ptr::write(u_addr.add(j), avg(top_u, bot_u));
+                        std::ptr::write(v_addr.add(j), avg(top_v, bot_v));
+                    }
+                    i += PIXELS;
                 }
-                let avg = |a: u8, b: u8| { ((u16::from(a) + u16::from(b) + 1) >> 1) as u8 };
-                for i in 0..PIXELS/2 {
-                    let top_u = std::ptr::read(top_uyvy_addr.add(4*i));
-                    let bot_u = std::ptr::read(bot_uyvy_addr.add(4*i));
-                    let top_v = std::ptr::read(top_uyvy_addr.add(4*i + 2));
-                    let bot_v = std::ptr::read(bot_uyvy_addr.add(4*i + 2));
-                    std::ptr::write(u_addr.add(i), avg(top_u, bot_u));
-                    std::ptr::write(v_addr.add(i), avg(top_v, bot_v));
+                if i < width {
+                    fallback(
+                        width - i,
+                        top_uyvy_addr.add(2 * i),
+                        bot_uyvy_addr.add(2 * i),
+                        top_y_addr.add(i),
+                        bot_y_addr.add(i),
+                        u_addr.add(i / 2),
+                        v_addr.add(i / 2),
+                    );
                 }
             }
         }
@@ -385,28 +541,30 @@ auto! {
 
 #[cfg(test)]
 mod tests {
-    macro_rules! test_block {
-        ($processor: ty, $mod: ident) => {
+    macro_rules! test_processor {
+        ($processor: ty, $mod: ident, $pixels: expr) => {
             mod $mod {
-                use super::super::BlockProcessor as _;
+                use super::super::RowProcessor as _;
                 type P = $processor;
 
                 /// Tests that a single `process` call produces the right `y` plane bytes.
                #[test]
                 fn y() {
                     let p = P::new().unwrap();
-                    let mut top_in = vec![0xff; P::PIXELS * 4];
-                    let mut bot_in = vec![0xff; P::PIXELS * 4];
-                    for i in 0..P::PIXELS {
+                    const PIXELS: usize = $pixels;
+                    let mut top_in = vec![0xff; PIXELS * 4];
+                    let mut bot_in = vec![0xff; PIXELS * 4];
+                    for i in 0..PIXELS {
                         top_in[2 * i + 1] = i as u8;
                         bot_in[2 * i + 1] = !(i as u8);
                     }
-                    let mut top_y_out = vec![0xff; P::PIXELS];
-                    let mut bot_y_out = vec![0xff; P::PIXELS];
-                    let mut u_out = vec![0xff; P::PIXELS / 2];
-                    let mut v_out = vec![0xff; P::PIXELS / 2];
+                    let mut top_y_out = vec![0xff; PIXELS];
+                    let mut bot_y_out = vec![0xff; PIXELS];
+                    let mut u_out = vec![0xff; PIXELS / 2];
+                    let mut v_out = vec![0xff; PIXELS / 2];
                     unsafe {
                         p.process(
+                            PIXELS,
                             top_in.as_ptr(),
                             bot_in.as_ptr(),
                             top_y_out.as_mut_ptr(),
@@ -415,8 +573,8 @@
                             v_out.as_mut_ptr(),
                         )
                     }
-                    let expected_top_y_out: [u8; P::PIXELS] = std::array::from_fn(|i| i as u8);
-                    let expected_bot_y_out: [u8; P::PIXELS] = std::array::from_fn(|i| !(i as u8));
+                    let expected_top_y_out: [u8; PIXELS] = std::array::from_fn(|i| i as u8);
+                    let expected_bot_y_out: [u8; PIXELS] = std::array::from_fn(|i| !(i as u8));
                     assert_eq!(&top_y_out[..], &expected_top_y_out[..]);
                     assert_eq!(&bot_y_out[..], &expected_bot_y_out[..]);
                 }
 
                 /// Tests that a single `process` call produces the right `u` and `v` plane bytes.
                 #[test]
                 fn uv() {
                     let p = P::new().unwrap();
-                    let mut top_in = vec![0xff; P::PIXELS * 4];
-                    let mut bot_in = vec![0xff; P::PIXELS * 4];
-                    for i in 0..P::PIXELS {
+                    const PIXELS: usize = $pixels;
+                    let mut top_in = vec![0xff; PIXELS * 4];
+                    let mut bot_in = vec![0xff; PIXELS * 4];
+                    for i in 0..PIXELS {
                         // u values avg to 0x20 + i (rounding up).
                         top_in[4 * i] = 0x10 + i as u8;
                         bot_in[4 * i] = 0x30 + i as u8;
 
                         // v values avg to 0x90 + i (rounding up).
                         top_in[4 * i + 2] = 0x80 + i as u8;
                         bot_in[4 * i + 2] = 0xa0 + i as u8;
                     }
-                    let mut top_y_out = vec![0xff; P::PIXELS];
-                    let mut bot_y_out = vec![0xff; P::PIXELS];
-                    let mut u_out = vec![0xff; P::PIXELS / 2];
-                    let mut v_out = vec![0xff; P::PIXELS / 2];
+                    let mut top_y_out = vec![0xff; PIXELS];
+                    let mut bot_y_out = vec![0xff; PIXELS];
+                    let mut u_out = vec![0xff; PIXELS / 2];
+                    let mut v_out = vec![0xff; PIXELS / 2];
                     unsafe {
                         p.process(
+                            PIXELS,
                             top_in.as_ptr(),
                             bot_in.as_ptr(),
                             top_y_out.as_mut_ptr(),
@@ -450,15 +610,14 @@
                             v_out.as_mut_ptr(),
                         )
                     }
-                    let expected_u_out: [u8; P::PIXELS / 2] =
-                        std::array::from_fn(|i| 0x20 + i as u8);
-                    let expected_v_out: [u8; P::PIXELS / 2] =
-                        std::array::from_fn(|i| 0x90 + i as u8);
+                    let expected_u_out: [u8; PIXELS / 2] = std::array::from_fn(|i| 0x20 + i as u8);
+                    let expected_v_out: [u8; PIXELS / 2] = std::array::from_fn(|i| 0x90 + i as u8);
                     assert_eq!(&u_out[..], &expected_u_out[..]);
                     assert_eq!(&v_out[..], &expected_v_out[..]);
                 }
 
                 /// Tests a full realistic frame.
+                #[cfg(not(miri))] // slow!
                 #[test]
                 fn full_frame() {
                     use crate::{
@@ -485,31 +644,75 @@
                     // `assert_eq!` output is unhelpful on these large binary arrays.
                     // On failure, it might be better to write to a file and diff with better tools,
                     // e.g.: `diff -u <(xxd src/testdata/out.yuv) <(xxd actual_out_auto.yuv)`
-                    // std::fs::write(
-                    //     concat!("actual_out_", stringify!($mod), ".yuv"),
-                    //     &actual_out[..],
-                    // )
-                    // .unwrap();
+                    // std::fs::write(
+                    //     concat!("actual_out_", stringify!($mod), ".yuv"),
+                    //     &actual_out.inner(),
+                    // )
+                    // .unwrap();
                     assert!(expected_out.planes() == actual_out.planes());
                 }
+
+                /// Tests a 3x3 frame, which is noteworthy in two ways.
+                /// * It exercises the special aliasing last row case.
+                /// * It exercises typical `RowProcessor`s' fallback paths.
+                #[test]
+                #[rustfmt::skip]
+                fn size3x3() {
+                    use crate::{frame::ConsecutiveFrame, PixelFormat};
+                    let uyvy_in = ConsecutiveFrame::new(PixelFormat::UYVY422, 3, 3).with_storage(&[
+                        // U0-1  Y0    V0-1  Y1    U2-2  Y2    V2-2  Yx
+                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // top row
+                        0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // middle row
+                        0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, // bottom row
+                    ][..]);
+                    let expected_out = ConsecutiveFrame::new(PixelFormat::I420, 3, 3).with_storage(&[
+                        // Y0    Y1    Y2
+                        0x01, 0x03, 0x05, // top row
+                        0x0a, 0x0c, 0x0e, // middle row
+                        0x13, 0x15, 0x17, // bottom row
+
+                        // U0-1  U2-2
+                        0x05, 0x09, // top+middle rows
+                        0x12, 0x16, // bottom row
+
+                        // V0-1  V2-2
+                        0x07, 0x0b, // top+middle rows
+                        0x14, 0x18, // bottom row
+                    ][..]);
+                    let mut actual_out =
+                        ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
+                    super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
+                    assert_eq!(expected_out.inner(), actual_out.inner());
+                }
             }
         };
     }
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::ExplicitAvx2DoubleBlock, explicit_double_avx2);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
+    test_processor!(
+        super::super::ExplicitAvx2DoubleBlock,
+        explicit_double_avx2,
+        64
+    );
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::ExplicitAvx2SingleBlock, explicit_single_avx2);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
+    test_processor!(
+        super::super::ExplicitAvx2SingleBlock,
+        explicit_single_avx2,
+        32
+    );
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::AutoAvx2Block<32>, auto_avx2);
+    test_processor!(super::super::AutoAvx2Block<32>, auto_avx2, 32);
 
     #[cfg(target_arch = "aarch64")]
-    test_block!(super::super::AutoNeonBlock<64>, auto_neon);
+    test_processor!(super::super::AutoNeonBlock<64>, auto_neon, 64);
 
     #[cfg(target_arch = "aarch64")]
-    test_block!(super::super::ExplicitNeon, explicit_neon);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
    test_processor!(super::super::ExplicitNeon, explicit_neon, 32);
 
-    test_block!(super::super::AutoVanillaBlock<32>, auto);
+    test_processor!(super::super::AutoVanillaBlock<32>, auto, 32);
 }
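---

A quick usage sketch of the relaxed entry point, not part of the patch itself: after this change, `uyvy_to_i420::convert` accepts odd widths and heights instead of returning `Err`. The crate name `pixelfmt` and the `demo` wrapper below are assumptions for illustration; the types and calls mirror the `size3x3` test above.

    use pixelfmt::{frame::ConsecutiveFrame, uyvy_to_i420::convert, PixelFormat};

    fn demo() -> Result<(), pixelfmt::ConversionError> {
        // 3x3 UYVY422 input: min_plane_dims rounds the odd width up to 4
        // pixels' worth of data, i.e. stride 8, with the final Y a don't-care.
        let uyvy_data = vec![0u8; 8 * 3];
        let uyvy_in = ConsecutiveFrame::new(PixelFormat::UYVY422, 3, 3)
            .with_storage(&uyvy_data[..]);
        // I420 output: a 3x3 Y plane plus 2x2 U and V planes.
        let mut yuv_out = ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
        convert(&uyvy_in, &mut yuv_out)?;
        Ok(())
    }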
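The chroma math above leans on one identity worth spelling out: `(w >> 1) + (w & 1)` is the round-up halving ceil(w / 2), computed without the overflow hazard of `(w + 1) / 2`. A standalone check (`ceil_half` is a hypothetical name, not one in the crate):

    /// Overflow-safe round-up halving, as used for chroma plane dimensions.
    fn ceil_half(w: usize) -> usize {
        (w >> 1) + (w & 1)
    }

    fn main() {
        assert_eq!(ceil_half(3), 2); // a 3-pixel-wide I420 frame has 2 chroma columns
        assert_eq!(ceil_half(4), 2);
        assert_eq!(ceil_half(usize::MAX), usize::MAX / 2 + 1); // (w + 1) / 2 would overflow here
    }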
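Similarly, the aliased `fallback` call for the final row of an odd-height frame works because the rounding average of a byte with itself is that byte, so passing the same row as both top and bottom passes its chroma through unchanged rather than subsampling vertically. A minimal demonstration of that invariant, using the same rounding-average expression as the patch:

    fn avg(a: u8, b: u8) -> u8 {
        ((u16::from(a) + u16::from(b) + 1) >> 1) as u8
    }

    fn main() {
        // avg(v, v) == v for every byte value, so the aliased last-row call
        // copies chroma instead of averaging two distinct rows.
        for v in 0..=255u8 {
            assert_eq!(avg(v, v), v);
        }
    }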