From 87f9976f2a943ff9a7cc0e122324d47027a71514 Mon Sep 17 00:00:00 2001
From: Scott Lamb
Date: Thu, 20 Jun 2024 18:21:34 -0700
Subject: [PATCH] support funny sizes

* refine `PlaneDims` logic for uyvy dimension calculation
* relax constraints on `uyvy_to_i420::convert`
* process rows, not just blocks, with processors deferring to a fallback.
  In theory at least the AVX one could use masked instructions instead,
  but the fallback approach was easy.
* test with miri
---
 .github/workflows/push.yml |  16 ++
 README.md                  |  10 +-
 src/frame.rs               |  57 ++--
 src/lib.rs                 |  69 ++++-
 src/uyvy_to_i420.rs        | 517 ++++++++++++++++++++++++++-----------
 5 files changed, 477 insertions(+), 192 deletions(-)

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 01cd156..ff74128 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -12,6 +12,7 @@ jobs:
     runs-on: ${{ matrix.runner }}
     strategy:
+      fail-fast: false
       matrix:
         runner: [ubuntu-latest, macos-latest]
     steps:
@@ -31,3 +32,18 @@ jobs:
     - run: cargo clippy -- --deny warnings
     - run: cargo test --release
     - run: cargo criterion
+
+  miri:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Setup Rust
+      uses: actions-rust-lang/setup-rust-toolchain@v1
+      with:
+        toolchain: nightly
+        components: miri
+    - name: Cache
+      uses: Swatinem/rust-cache@v2
+      with:
+        key: miri
+    - run: cargo miri test
\ No newline at end of file
diff --git a/README.md b/README.md
index e78f148..e485678 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,10 @@ Limitations and future work:
 * Returns `Err` on x86\_64 CPUs that don't support AVX2. We'll likely add
   an SSE2 fallback later. As SSE2 is in the core x86\_64 instruction set,
   this would mean all x86\_64 CPUs would be supported.
-* Returns `Err` for frame widths that aren't a multiple
-  of 64 pixels (for AVX2) or 32 pixels (for NEON). This could be eased via a
-  scalar fallback path for the remaining pixels, or (for AVX2) masked
-  load/store instructions.
-* Returns `Err` for frame heights that aren't a multiple of 2.
 * Expects to process full horizontal lines. This is likely to
-  change to allow working on cropped regions and outputting to frames with
-  extra padding between lines as required by some APIs/devices.
+  change to allow working on cropped regions.
+* Does not support output to a frame with padding, as required by some
+  APIs/devices.
 * The ARM NEON code is less optimized than the AVX2 code today.
 
 You may find the notes in [`docs/simd.md`](docs/simd.md) helpful if you are new
diff --git a/src/frame.rs b/src/frame.rs
index 69a3d51..ad36b15 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -52,14 +52,6 @@ pub unsafe trait Frame {
     /// Returns true if this frame has been fully initialized.
     fn initialized(&self) -> bool;
 
-    /// Marks this frame as fully initialized.
-    ///
-    /// # Safety
-    ///
-    /// The caller must ensure that the frame is fully initialized, including
-    /// any padding bytes.
-    unsafe fn initialize(&mut self);
-
     /// Returns the (image format-defined) planes for read/shared access.
     fn planes(&self) -> ArrayVec;
 }
@@ -74,6 +66,14 @@ pub unsafe trait Frame {
 pub unsafe trait FrameMut: Frame {
     /// Returns the (image format-defined) planes for mutation/exclusive access.
     fn planes_mut(&mut self) -> ArrayVec;
+
+    /// Marks this frame as fully initialized.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that the frame is fully initialized, including
+    /// any padding bytes.
+    unsafe fn initialize(&mut self);
 }
 
 /// Provides read-only access to a given image plane.
@@ -247,6 +247,16 @@ pub unsafe trait Storage {
 
     /// Returns a raw pointer to the start of the storage.
     fn as_ptr(&self) -> *const u8;
+}
+
+/// Write access to a backing buffer for a [`ConsecutiveFrame`].
+///
+/// # Safety
+///
+/// As in [`Storage`].
+pub unsafe trait StorageMut: Storage {
+    /// Returns a raw pointer to the start of the storage.
+    fn as_mut_ptr(&mut self) -> *mut u8;
 
     /// Notes that this storage is initialized, up to length `len`.
     ///
@@ -262,16 +272,6 @@
     unsafe fn initialize(&mut self, len: usize) {}
 }
 
-/// Write access to a backing buffer for a [`ConsecutiveFrame`].
-///
-/// # Safety
-///
-/// As in [`Storage`].
-pub unsafe trait StorageMut: Storage {
-    /// Returns a raw pointer to the start of the storage.
-    fn as_mut_ptr(&mut self) -> *mut u8;
-}
-
 unsafe impl Storage for Vec<u8> {
     #[inline]
     fn check_len(&self, len: usize) -> bool {
@@ -289,6 +289,11 @@ unsafe impl StorageMut for Vec<u8> {
     fn as_mut_ptr(&mut self) -> *mut u8 {
         self.as_mut_ptr()
     }
+
+    #[inline]
+    unsafe fn initialize(&mut self, len: usize) {
+        unsafe { self.set_len(len) };
+    }
 }
 
 macro_rules! impl_slice_storage {
@@ -478,14 +483,6 @@ unsafe impl<S: Storage> Frame for ConsecutiveFrame<S> {
         }
         planes
     }
-
-    #[inline]
-    unsafe fn initialize(&mut self) {
-        if !self.initialized {
-            self.storage.initialize(self.total_size());
-            self.initialized = true;
-        }
-    }
 }
 
 unsafe impl<S: StorageMut> FrameMut for ConsecutiveFrame<S> {
@@ -506,4 +503,12 @@ unsafe impl<S: StorageMut> FrameMut for ConsecutiveFrame<S> {
         }
         planes
     }
+
+    #[inline]
+    unsafe fn initialize(&mut self) {
+        if !self.initialized {
+            self.storage.initialize(self.total_size());
+            self.initialized = true;
+        }
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 303b866..320d086 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -40,11 +40,17 @@ pub enum PixelFormat {
     /// [UYVY](https://fourcc.org/pixel-format/yuv-uyvy/).
     ///
     /// Matches ffmpeg's `AV_PIX_FMT_UYVY422`: "packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1".
+    ///
+    /// For odd-width images, the width is rounded up to the next multiple of 2,
+    /// with the final `Y` as a don't-care byte, and the final chroma values not
+    /// subsampled.
     UYVY422,
 
     /// [I420](https://fourcc.org/pixel-format/yuv-i420/).
     ///
     /// Matches ffmpeg's `AV_PIX_FMT_YUV420P`: "planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)".
+    ///
+    /// For odd-width and odd-height images, the final pixel is not subsampled.
     I420,
 
     /// BGRA.
@@ -81,7 +87,11 @@ impl PixelFormat {
         match self {
             PixelFormat::UYVY422 => {
                 sizes.push(PlaneDims {
-                    stride: width.checked_shl(1).expect("stride should not overflow"),
+                    // Round to next multiple of 2, then double.
+                    stride: width
+                        .checked_add(width & 1)
+                        .and_then(|w| w.checked_shl(1))
+                        .expect("stride should not overflow"),
                     rows: height,
                 });
             }
@@ -91,8 +101,8 @@ impl PixelFormat {
                 sizes.push(PlaneDims {
                     stride: width,
                     rows: height,
                 });
+                // U/V planes.
                 let chroma_plane_size = PlaneDims {
-                    // U/V planes.
                     // Overflow-safe divide by two that rounds up.
                    stride: (width >> 1) + (width & 1),
                    rows: (height >> 1) + (height & 1),
@@ -120,3 +130,58 @@ impl PixelFormat {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn odd_sizes() {
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(1, 1)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 4, rows: 1 }]
+        );
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(2, 2)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 4, rows: 2 }]
+        );
+        assert_eq!(
+            super::PixelFormat::UYVY422
+                .min_plane_dims(3, 3)
+                .collect::<Vec<_>>(),
+            vec![super::PlaneDims { stride: 8, rows: 3 }]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(1, 1)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 }
+            ]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(2, 2)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 2, rows: 2 },
+                super::PlaneDims { stride: 1, rows: 1 },
+                super::PlaneDims { stride: 1, rows: 1 }
+            ]
+        );
+        assert_eq!(
+            super::PixelFormat::I420
+                .min_plane_dims(3, 3)
+                .collect::<Vec<_>>(),
+            vec![
+                super::PlaneDims { stride: 3, rows: 3 },
+                super::PlaneDims { stride: 2, rows: 2 },
+                super::PlaneDims { stride: 2, rows: 2 }
+            ]
+        );
+    }
+}
diff --git a/src/uyvy_to_i420.rs b/src/uyvy_to_i420.rs
index 91aefcd..e203102 100644
--- a/src/uyvy_to_i420.rs
+++ b/src/uyvy_to_i420.rs
@@ -18,25 +18,25 @@ use crate::{
     ConversionError,
 };
 
-/// Processes a block `PIXELS` columns wide, 2 rows high.
+/// Processes a block of 2 rows.
 #[doc(hidden)]
-pub trait BlockProcessor: Copy + Clone + Sized + Send + Sync {
-    /// The width of this block in pixels.
-    const PIXELS: usize;
-
+pub trait RowProcessor: Copy + Clone + Sized + Send + Sync {
     /// Returns an instance if this processor is supported on this machine.
     fn new() -> Result<Self, ConversionError>;
 
-    /// Processes a block `PIXELS` wide, two rows high.
+    /// Processes a block `width` pixels wide, two rows high.
     ///
     /// # Safety
    ///
     /// Caller must ensure the following:
-    /// * `top_uyvy_addr` and `bot_uyvy_addr` each contain `2 * PIXELS` bytes of initialized data.
-    /// * `top_y_addr` and `bot_y_addr` are each valid destinations for `PIXELS` bytes.
-    /// * `u_addr` and `v_addr` are each valid destinations for `PIXELS / 2` bytes.
+    /// * `top_uyvy_addr` and `bot_uyvy_addr` each contain `2 * width` bytes of initialized data.
+    /// * `top_y_addr` and `bot_y_addr` are each valid destinations for `width` bytes.
+    ///   They may however alias each other, which would not be allowed with `&mut`.
+    /// * `u_addr` and `v_addr` are each valid destinations for `(width + 1) / 2` bytes.
+    #[allow(clippy::too_many_arguments)]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -64,7 +64,7 @@ pub fn convert<FI: Frame, FO: FrameMut>(
 }
 
 #[doc(hidden)]
-pub fn convert_with<P: BlockProcessor, FI: Frame, FO: FrameMut>(
+pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
     uyvy_in: &FI,
     yuv_out: &mut FO,
 ) -> Result<(), ConversionError> {
@@ -76,10 +76,6 @@
     {
         return Err(ConversionError("invalid arguments"));
     }
-    if width % P::PIXELS != 0 || height % 2 != 0 {
-        // TODO: support irregular sizes.
-        return Err(ConversionError("irregular sizes unsupported"));
-    }
     let p = P::new()?;
     let pixels = width * height;
     let uyvy_planes = uyvy_in.planes();
@@ -90,31 +86,53 @@
     let [y_out, u_out, v_out] = &mut yuv_planes[..] else {
         panic!("yuv_out must have three planes");
     };
-    if y_out.stride() != width || u_out.stride() != width / 2 || v_out.stride() != width / 2 {
-        // TODO: support padding.
-        return Err(ConversionError("padding unsupported"));
+    let chroma_width = (width >> 1) + (width & 1);
+    let chroma_rows = (height >> 1) + (height & 1);
+    let chroma_size = chroma_width * chroma_rows;
+    if y_out.stride() != width || u_out.stride() != chroma_width || v_out.stride() != chroma_width {
+        return Err(ConversionError("output padding unsupported")); // TODO
     }
-    assert_eq!(uyvy_in.len(), pixels * 2);
+    let uyvy_stride = uyvy_in.stride();
+    assert!(uyvy_stride >= width * 2);
+    assert!(uyvy_in.len() >= height * uyvy_stride - (uyvy_stride - width * 2));
     let uyvy_in = uyvy_in.as_ptr();
     assert_eq!(y_out.len(), pixels);
-    assert_eq!(u_out.len(), pixels / 4);
-    assert_eq!(v_out.len(), pixels / 4);
-    let y_out = y_out.as_mut_ptr().cast::<u8>();
-    let uyvy_row_stride = 2 * width; // TODO: support line padding?
-    let u_out = u_out.as_mut_ptr().cast::<u8>();
-    let v_out = v_out.as_mut_ptr().cast::<u8>();
-    for r in (0..height).step_by(2) {
-        for c in (0..width).step_by(P::PIXELS) {
-            unsafe {
-                p.process(
-                    uyvy_in.add(r * uyvy_row_stride + 2 * c),
-                    uyvy_in.add((r + 1) * uyvy_row_stride + 2 * c),
-                    y_out.add(r * width + c),
-                    y_out.add((r + 1) * width + c),
-                    u_out.add((r >> 1) * (width >> 1) + (c >> 1)),
-                    v_out.add((r >> 1) * (width >> 1) + (c >> 1)),
-                );
-            }
+    assert_eq!(u_out.len(), chroma_size);
+    assert_eq!(v_out.len(), chroma_size);
+    let y_out = y_out.as_mut_ptr();
+    let u_out = u_out.as_mut_ptr();
+    let v_out = v_out.as_mut_ptr();
+    let mut r = 0;
+    loop {
+        if r + 2 > height {
+            break;
+        }
+        unsafe {
+            p.process(
+                width,
+                uyvy_in.add(r * uyvy_stride),
+                uyvy_in.add((r + 1) * uyvy_stride),
+                y_out.add(r * width),
+                y_out.add((r + 1) * width),
+                u_out.add((r >> 1) * chroma_width),
+                v_out.add((r >> 1) * chroma_width),
+            );
+        }
+        r += 2;
+    }
+    if r < height {
+        // Process the last row without subsampling vertically.
+        unsafe {
+            // XXX: use p.process.
+            fallback(
+                width,
+                uyvy_in.add(r * uyvy_stride),
+                uyvy_in.add(r * uyvy_stride), // aliased!
+                y_out.add(r * width),
+                y_out.add(r * width), // aliased!
+                u_out.add((r >> 1) * chroma_width),
+                v_out.add((r >> 1) * chroma_width),
+            );
         }
     }
     drop(yuv_planes);
@@ -136,15 +154,44 @@ unsafe fn hexprint(v: std::arch::x86_64::__m256i) -> impl std::fmt::Display {
     )
 }
 
+/// Scalar processing of 2 rows of `width` pixels, averaging chroma with rounding.
+/// SIMD processors defer to this for tails narrower than their block size, and
+/// `convert_with` calls it with aliased rows for the final row of odd-height frames.
+unsafe fn fallback(
+    width: usize,
+    top_uyvy_addr: *const u8,
+    bot_uyvy_addr: *const u8,
+    top_y_addr: *mut u8,
+    bot_y_addr: *mut u8,
+    u_addr: *mut u8,
+    v_addr: *mut u8,
+) {
+    for i in 0..width {
+        std::ptr::write(
+            top_y_addr.add(i),
+            std::ptr::read(top_uyvy_addr.add(2 * i + 1)),
+        );
+        std::ptr::write(
+            bot_y_addr.add(i),
+            std::ptr::read(bot_uyvy_addr.add(2 * i + 1)),
+        );
+    }
+    let avg = |a: u8, b: u8| ((u16::from(a) + u16::from(b) + 1) >> 1) as u8;
+    let chroma_width = (width >> 1) + (width & 1);
+    for i in 0..chroma_width {
+        let top_u = std::ptr::read(top_uyvy_addr.add(4 * i));
+        let bot_u = std::ptr::read(bot_uyvy_addr.add(4 * i));
+        let top_v = std::ptr::read(top_uyvy_addr.add(4 * i + 2));
+        let bot_v = std::ptr::read(bot_uyvy_addr.add(4 * i + 2));
+        std::ptr::write(u_addr.add(i), avg(top_u, bot_u));
+        std::ptr::write(v_addr.add(i), avg(top_v, bot_v));
+    }
+}
+
 #[cfg(target_arch = "x86_64")]
 #[doc(hidden)]
 #[derive(Copy, Clone)]
 pub struct ExplicitAvx2DoubleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl BlockProcessor for ExplicitAvx2DoubleBlock {
-    const PIXELS: usize = 64;
-
+impl RowProcessor for ExplicitAvx2DoubleBlock {
     #[inline]
     fn new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
@@ -158,6 +205,7 @@
     #[inline(never)]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -171,41 +219,68 @@
             1, 3, 5, 7, 9, 11, 13, 15, // lower half: 8 Y components.
             0, 4, 8, 12, 2, 6, 10, 14, // upper half: (4 * U), (4 * V).
         ));
-        let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 4] {
-            std::array::from_fn(|i| {
-                // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
-                let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
-                // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
-                x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
-            })
-        });
-        for (data, addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
-            for i in [0, 1] {
-                // Put into 64-groups: y0 y2 y1 y3.
-                // VPUNPCKLQDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
-                let swapped = x86_64::_mm256_unpacklo_epi64(data[2 * i], data[2 * i + 1]);
-
-                // Swap y2 and y1 to produce: y0 y1 y2 y3.
-                // VPERMQ (YMM, YMM, I8) on Zen2: lat 6, cpi 1.27
-                x86_64::_mm256_storeu_si256(
-                    addr.add(32 * i) as _,
-                    x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(swapped),
-                );
+
+        // Process the nice blocks.
+        const BLOCK_SIZE: usize = 64;
+        let mut i = 0;
+        loop {
+            let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+            let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+            let top_y_addr = top_y_addr.add(i);
+            let bot_y_addr = bot_y_addr.add(i);
+            let u_addr = u_addr.add(i / 2);
+            let v_addr = v_addr.add(i / 2);
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 4] {
+                std::array::from_fn(|i| {
+                    // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
+                    let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
+                    // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
+                    x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
+                })
+            });
+            for (data, addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
+                for i in [0, 1] {
+                    // Put into 64-groups: y0 y2 y1 y3.
+                    // VPUNPCKLQDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
+                    let swapped = x86_64::_mm256_unpacklo_epi64(data[2 * i], data[2 * i + 1]);
+
+                    // Swap y2 and y1 to produce: y0 y1 y2 y3.
+                    // VPERMQ (YMM, YMM, I8) on Zen2: lat 6, cpi 1.27
+                    x86_64::_mm256_storeu_si256(
+                        addr.add(32 * i) as _,
+                        x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(swapped),
+                    );
+                }
             }
+            let uv: [_; 2] = std::array::from_fn(|i| {
+                // unpackhi_epi32(data[0], data[1]) returns (u0 u2 v0 v2) (u1 u3 v1 v3).
+                // VPUNPCKHDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
+                x86_64::_mm256_avg_epu8(
+                    x86_64::_mm256_unpackhi_epi32(top[2 * i], top[2 * i + 1]),
+                    x86_64::_mm256_unpackhi_epi32(bot[2 * i], bot[2 * i + 1]),
+                )
+            });
+            let mix = x86_64::_mm256_permute2x128_si256::<0b_00_11_00_01>(uv[0], uv[1]);
+            let uv0prime =
+                x86_64::_mm256_inserti128_si256::<1>(uv[0], x86_64::_mm256_castsi256_si128(uv[1]));
+            x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
+            x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
+            );
+        }
-        let uv: [_; 2] = std::array::from_fn(|i| {
-            // unpackhi_epi32(data[0], data[1]) returns (u0 u2 v0 v2) (u1 u3 v1 v3).
-            // VPUNPCKHDQ (YMM, YMM, YMM) on Zen2: lat 1, cpi 0.50, ports 1*FP12
-            x86_64::_mm256_avg_epu8(
-                x86_64::_mm256_unpackhi_epi32(top[2 * i], top[2 * i + 1]),
-                x86_64::_mm256_unpackhi_epi32(bot[2 * i], bot[2 * i + 1]),
-            )
-        });
-        let mix = x86_64::_mm256_permute2x128_si256::<0b_00_11_00_01>(uv[0], uv[1]);
-        let uv0prime =
-            x86_64::_mm256_inserti128_si256::<1>(uv[0], x86_64::_mm256_castsi256_si128(uv[1]));
-        x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
-        x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
     }
 }
 
@@ -215,9 +290,7 @@
 pub struct ExplicitAvx2SingleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl BlockProcessor for ExplicitAvx2SingleBlock {
-    const PIXELS: usize = 32;
-
+impl RowProcessor for ExplicitAvx2SingleBlock {
     #[inline]
     fn new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
@@ -231,6 +304,7 @@
     #[target_feature(enable = "avx2")]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -244,33 +318,59 @@
             1, 3, 5, 7, 9, 11, 13, 15, // lower half: 8 Y components.
             0, 4, 8, 12, 2, 6, 10, 14, // upper half: (4 * U), (4 * V).
         ));
-        let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 2] {
-            std::array::from_fn(|i| {
-                // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
-                let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
-                // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
-                x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
-            })
-        });
-        for (data, y_addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
-            let y = x86_64::_mm256_unpacklo_epi64(data[0], data[1]);
-            // VMOVDQU (M256, YMM) on Zen2: ports 1*FP2.
-            x86_64::_mm256_storeu_si256(
-                y_addr as _,
-                x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(y),
+        // Process the nice blocks.
+        const BLOCK_SIZE: usize = 32;
+        let mut i = 0;
+        loop {
+            let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+            let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+            let top_y_addr = top_y_addr.add(i);
+            let bot_y_addr = bot_y_addr.add(i);
+            let u_addr = u_addr.add(i / 2);
+            let v_addr = v_addr.add(i / 2);
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 2] {
+                std::array::from_fn(|i| {
+                    // VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
+                    let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
+                    // VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
+                    x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
+                })
+            });
+            for (data, y_addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
+                let y = x86_64::_mm256_unpacklo_epi64(data[0], data[1]);
+                // VMOVDQU (M256, YMM) on Zen2: ports 1*FP2.
+                x86_64::_mm256_storeu_si256(
+                    y_addr as _,
+                    x86_64::_mm256_permute4x64_epi64::<0b11_01_10_00>(y),
+                );
+            }
+
+            let uv = x86_64::_mm256_avg_epu8(
+                x86_64::_mm256_unpackhi_epi32(top[0], top[1]),
+                x86_64::_mm256_unpackhi_epi32(bot[0], bot[1]),
+            );
+            let p = x86_64::_mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+            x86_64::_mm256_storeu2_m128i(
+                v_addr as _,
+                u_addr as _,
+                x86_64::_mm256_permutevar8x32_epi32(uv, p),
+            );
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
             );
         }
-
-        let uv = x86_64::_mm256_avg_epu8(
-            x86_64::_mm256_unpackhi_epi32(top[0], top[1]),
-            x86_64::_mm256_unpackhi_epi32(bot[0], bot[1]),
-        );
-        let p = x86_64::_mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
-        x86_64::_mm256_storeu2_m128i(
-            v_addr as _,
-            u_addr as _,
-            x86_64::_mm256_permutevar8x32_epi32(uv, p),
-        );
     }
 }
 
@@ -280,9 +380,7 @@
 pub struct ExplicitNeon(());
 
 #[cfg(target_arch = "aarch64")]
-impl BlockProcessor for ExplicitNeon {
-    const PIXELS: usize = 32;
-
+impl RowProcessor for ExplicitNeon {
     fn new() -> Result<Self, ConversionError> {
         if std::arch::is_aarch64_feature_detected!("neon") {
             Ok(Self(()))
@@ -295,6 +393,7 @@
     #[target_feature(enable = "neon")]
     unsafe fn process(
         self,
+        width: usize,
         top_uyvy_addr: *const u8,
         bot_uyvy_addr: *const u8,
         top_y_addr: *mut u8,
@@ -302,12 +401,43 @@
         u_addr: *mut u8,
         v_addr: *mut u8,
     ) {
-        let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr);
-        let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr);
-        aarch64::vst2q_u8(top_y_addr, aarch64::uint8x16x2_t(top_uyvy.1, top_uyvy.3));
-        aarch64::vst2q_u8(bot_y_addr, aarch64::uint8x16x2_t(bot_uyvy.1, bot_uyvy.3));
-        aarch64::vst1q_u8(u_addr, aarch64::vrhaddq_u8(top_uyvy.0, bot_uyvy.0));
-        aarch64::vst1q_u8(v_addr, aarch64::vrhaddq_u8(top_uyvy.2, bot_uyvy.2));
+        const BLOCK_SIZE: usize = 32;
+        let mut i = 0;
+        loop {
+            if i + BLOCK_SIZE > width {
+                break;
+            }
+            let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr.add(2 * i));
+            let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr.add(2 * i));
+            aarch64::vst2q_u8(
+                top_y_addr.add(i),
+                aarch64::uint8x16x2_t(top_uyvy.1, top_uyvy.3),
+            );
+            aarch64::vst2q_u8(
+                bot_y_addr.add(i),
+                aarch64::uint8x16x2_t(bot_uyvy.1, bot_uyvy.3),
+            );
+            aarch64::vst1q_u8(
+                u_addr.add(i / 2),
+                aarch64::vrhaddq_u8(top_uyvy.0, bot_uyvy.0),
+            );
+            aarch64::vst1q_u8(
+                v_addr.add(i / 2),
+                aarch64::vrhaddq_u8(top_uyvy.2, bot_uyvy.2),
+            );
+            i += BLOCK_SIZE;
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr.add(2 * i),
+                bot_uyvy_addr.add(2 * i),
+                top_y_addr.add(i),
+                bot_y_addr.add(i),
+                u_addr.add(i / 2),
+                v_addr.add(i / 2),
+            );
+        }
     }
 }
 
@@ -322,9 +452,7 @@ macro_rules! auto {
         #[derive(Copy, Clone)]
         pub struct $ident<const PIXELS: usize>([(); PIXELS]);
 
-        impl<const PIXELS: usize> BlockProcessor for $ident<PIXELS> {
-            const PIXELS: usize = PIXELS;
-
+        impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
            #[inline(always)]
            fn new() -> Result<Self, ConversionError> {
                if true && $($supported)+ {
@@ -338,6 +466,7 @@
             $(#[target_feature(enable = $feature)])*
             unsafe fn process(
                 self,
+                width: usize,
                 top_uyvy_addr: *const u8,
                 bot_uyvy_addr: *const u8,
                 top_y_addr: *mut u8,
@@ -345,18 +474,45 @@
                 u_addr: *mut u8,
                 v_addr: *mut u8,
             ) {
-                for i in 0..PIXELS {
-                    std::ptr::write(top_y_addr.add(i), std::ptr::read(top_uyvy_addr.add(2*i + 1)));
-                    std::ptr::write(bot_y_addr.add(i), std::ptr::read(bot_uyvy_addr.add(2*i + 1)));
+                let mut i = 0;
+
+                // The u/v logic below doesn't handle an odd number of pixels per block.
+                const { assert!(PIXELS % 2 == 0); }
+                loop {
+                    if i + PIXELS > width {
+                        break;
+                    }
+                    let top_uyvy_addr = top_uyvy_addr.add(2 * i);
+                    let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
+                    let top_y_addr = top_y_addr.add(i);
+                    let bot_y_addr = bot_y_addr.add(i);
+                    let u_addr = u_addr.add(i / 2);
+                    let v_addr = v_addr.add(i / 2);
+                    for j in 0..PIXELS {
+                        std::ptr::write(top_y_addr.add(j), std::ptr::read(top_uyvy_addr.add(2*j + 1)));
+                        std::ptr::write(bot_y_addr.add(j), std::ptr::read(bot_uyvy_addr.add(2*j + 1)));
+                    }
+                    let avg = |a: u8, b: u8| { ((u16::from(a) + u16::from(b) + 1) >> 1) as u8 };
+                    for j in 0..PIXELS/2 {
+                        let top_u = std::ptr::read(top_uyvy_addr.add(4*j));
+                        let bot_u = std::ptr::read(bot_uyvy_addr.add(4*j));
+                        let top_v = std::ptr::read(top_uyvy_addr.add(4*j + 2));
+                        let bot_v = std::ptr::read(bot_uyvy_addr.add(4*j + 2));
+                        std::ptr::write(u_addr.add(j), avg(top_u, bot_u));
+                        std::ptr::write(v_addr.add(j), avg(top_v, bot_v));
+                    }
+                    i += PIXELS;
                 }
-                let avg = |a: u8, b: u8| { ((u16::from(a) + u16::from(b) + 1) >> 1) as u8 };
-                for i in 0..PIXELS/2 {
-                    let top_u = std::ptr::read(top_uyvy_addr.add(4*i));
-                    let bot_u = std::ptr::read(bot_uyvy_addr.add(4*i));
-                    let top_v = std::ptr::read(top_uyvy_addr.add(4*i + 2));
-                    let bot_v = std::ptr::read(bot_uyvy_addr.add(4*i + 2));
-                    std::ptr::write(u_addr.add(i), avg(top_u, bot_u));
-                    std::ptr::write(v_addr.add(i), avg(top_v, bot_v));
+                if i < width {
+                    fallback(
+                        width - i,
+                        top_uyvy_addr.add(2 * i),
+                        bot_uyvy_addr.add(2 * i),
+                        top_y_addr.add(i),
+                        bot_y_addr.add(i),
+                        u_addr.add(i / 2),
+                        v_addr.add(i / 2),
+                    );
                 }
             }
         }
@@ -385,28 +541,30 @@ auto! {
 
 #[cfg(test)]
 mod tests {
-    macro_rules! test_block {
-        ($processor: ty, $mod: ident) => {
+    macro_rules! test_processor {
+        ($processor: ty, $mod: ident, $pixels: expr) => {
             mod $mod {
-                use super::super::BlockProcessor as _;
+                use super::super::RowProcessor as _;
                 type P = $processor;
 
                 /// Tests that a single `process` call produces the right `y` plane bytes.
                #[test]
                 fn y() {
                     let p = P::new().unwrap();
-                    let mut top_in = vec![0xff; P::PIXELS * 4];
-                    let mut bot_in = vec![0xff; P::PIXELS * 4];
-                    for i in 0..P::PIXELS {
+                    const PIXELS: usize = $pixels;
+                    let mut top_in = vec![0xff; PIXELS * 4];
+                    let mut bot_in = vec![0xff; PIXELS * 4];
+                    for i in 0..PIXELS {
                         top_in[2 * i + 1] = i as u8;
                         bot_in[2 * i + 1] = !(i as u8);
                     }
-                    let mut top_y_out = vec![0xff; P::PIXELS];
-                    let mut bot_y_out = vec![0xff; P::PIXELS];
-                    let mut u_out = vec![0xff; P::PIXELS / 2];
-                    let mut v_out = vec![0xff; P::PIXELS / 2];
+                    let mut top_y_out = vec![0xff; PIXELS];
+                    let mut bot_y_out = vec![0xff; PIXELS];
+                    let mut u_out = vec![0xff; PIXELS / 2];
+                    let mut v_out = vec![0xff; PIXELS / 2];
                     unsafe {
                         p.process(
+                            PIXELS,
                             top_in.as_ptr(),
                             bot_in.as_ptr(),
                             top_y_out.as_mut_ptr(),
@@ -415,8 +573,8 @@
                             v_out.as_mut_ptr(),
                         )
                     }
-                    let expected_top_y_out: [u8; P::PIXELS] = std::array::from_fn(|i| i as u8);
-                    let expected_bot_y_out: [u8; P::PIXELS] = std::array::from_fn(|i| !(i as u8));
+                    let expected_top_y_out: [u8; PIXELS] = std::array::from_fn(|i| i as u8);
+                    let expected_bot_y_out: [u8; PIXELS] = std::array::from_fn(|i| !(i as u8));
                     assert_eq!(&top_y_out[..], &expected_top_y_out[..]);
                     assert_eq!(&bot_y_out[..], &expected_bot_y_out[..]);
                 }
 
                 /// Tests that a single `process` call produces the right `u` and `v` plane bytes.
                 #[test]
                 fn uv() {
                     let p = P::new().unwrap();
-                    let mut top_in = vec![0xff; P::PIXELS * 4];
-                    let mut bot_in = vec![0xff; P::PIXELS * 4];
-                    for i in 0..P::PIXELS {
+                    const PIXELS: usize = $pixels;
+                    let mut top_in = vec![0xff; PIXELS * 4];
+                    let mut bot_in = vec![0xff; PIXELS * 4];
+                    for i in 0..PIXELS {
                         // u values avg to 0x20 + i (rounding up).
                         top_in[4 * i] = 0x10 + i as u8;
                         bot_in[4 * i] = 0x30 + i as u8;
 
                         // v values avg to 0x90 + i (rounding up).
                         top_in[4 * i + 2] = 0x80 + i as u8;
                         bot_in[4 * i + 2] = 0xa0 + i as u8;
                     }
-                    let mut top_y_out = vec![0xff; P::PIXELS];
-                    let mut bot_y_out = vec![0xff; P::PIXELS];
-                    let mut u_out = vec![0xff; P::PIXELS / 2];
-                    let mut v_out = vec![0xff; P::PIXELS / 2];
+                    let mut top_y_out = vec![0xff; PIXELS];
+                    let mut bot_y_out = vec![0xff; PIXELS];
+                    let mut u_out = vec![0xff; PIXELS / 2];
+                    let mut v_out = vec![0xff; PIXELS / 2];
                     unsafe {
                         p.process(
+                            PIXELS,
                             top_in.as_ptr(),
                             bot_in.as_ptr(),
                             top_y_out.as_mut_ptr(),
@@ -450,15 +610,14 @@
                             v_out.as_mut_ptr(),
                         )
                     }
-                    let expected_u_out: [u8; P::PIXELS / 2] =
-                        std::array::from_fn(|i| 0x20 + i as u8);
-                    let expected_v_out: [u8; P::PIXELS / 2] =
-                        std::array::from_fn(|i| 0x90 + i as u8);
+                    let expected_u_out: [u8; PIXELS / 2] = std::array::from_fn(|i| 0x20 + i as u8);
+                    let expected_v_out: [u8; PIXELS / 2] = std::array::from_fn(|i| 0x90 + i as u8);
                     assert_eq!(&u_out[..], &expected_u_out[..]);
                     assert_eq!(&v_out[..], &expected_v_out[..]);
                 }
 
                 /// Tests a full realistic frame.
+                #[cfg(not(miri))] // slow!
                 #[test]
                 fn full_frame() {
                     use crate::{
@@ -485,31 +644,75 @@
                     // `assert_eq!` output is unhelpful on these large binary arrays.
                     // On failure, it might be better to write to a file and diff with better tools,
                     // e.g.: `diff -u <(xxd src/testdata/out.yuv) <(xxd actual_out_auto.yuv)`
-                    // std::fs::write(
-                    //     concat!("actual_out_", stringify!($mod), ".yuv"),
-                    //     &actual_out[..],
-                    // )
-                    // .unwrap();
+                    // std::fs::write(
+                    //     concat!("actual_out_", stringify!($mod), ".yuv"),
+                    //     &actual_out.inner(),
+                    // )
+                    // .unwrap();
                     assert!(expected_out.planes() == actual_out.planes());
                 }
+
+                /// Tests a 3x3 frame, which is noteworthy in two ways.
+                /// * It exercises the special aliasing last row case.
+                /// * It exercises typical `RowProcessor`s' fallback paths.
+                #[test]
+                #[rustfmt::skip]
+                fn size3x3() {
+                    use crate::{frame::ConsecutiveFrame, PixelFormat};
+                    let uyvy_in = ConsecutiveFrame::new(PixelFormat::UYVY422, 3, 3).with_storage(&[
+                        // U0-1  Y0    V0-1  Y1    U2-2  Y2    V2-2  Yx
+                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // top row
+                        0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // middle row
+                        0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, // bottom row
+                    ][..]);
+                    let expected_out = ConsecutiveFrame::new(PixelFormat::I420, 3, 3).with_storage(&[
+                        // Y0    Y1    Y2
+                        0x01, 0x03, 0x05, // top row
+                        0x0a, 0x0c, 0x0e, // middle row
+                        0x13, 0x15, 0x17, // bottom row
+
+                        // U0-1  U2-2
+                        0x05, 0x09, // top+middle rows
+                        0x12, 0x16, // bottom row
+
+                        // V0-1  V2-2
+                        0x07, 0x0b, // top+middle rows
+                        0x14, 0x18, // bottom row
+                    ][..]);
+                    let mut actual_out =
+                        ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
+                    super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
+                    assert_eq!(expected_out.inner(), actual_out.inner());
+                }
             }
         };
     }
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::ExplicitAvx2DoubleBlock, explicit_double_avx2);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
+    test_processor!(
+        super::super::ExplicitAvx2DoubleBlock,
+        explicit_double_avx2,
+        64
+    );
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::ExplicitAvx2SingleBlock, explicit_single_avx2);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
+    test_processor!(
+        super::super::ExplicitAvx2SingleBlock,
+        explicit_single_avx2,
+        32
+    );
 
     #[cfg(target_arch = "x86_64")]
-    test_block!(super::super::AutoAvx2Block<32>, auto_avx2);
+    test_processor!(super::super::AutoAvx2Block<32>, auto_avx2, 32);
 
     #[cfg(target_arch = "aarch64")]
-    test_block!(super::super::AutoNeonBlock<64>, auto_neon);
+    test_processor!(super::super::AutoNeonBlock<64>, auto_neon, 64);
 
     #[cfg(target_arch = "aarch64")]
-    test_block!(super::super::ExplicitNeon, explicit_neon);
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
    test_processor!(super::super::ExplicitNeon, explicit_neon, 32);
 
-    test_block!(super::super::AutoVanillaBlock<32>, auto);
+    test_processor!(super::super::AutoVanillaBlock<32>, auto, 32);
 }
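---

A quick usage sketch of the relaxed entry point, not part of the patch itself: after this change, `uyvy_to_i420::convert` accepts odd widths and heights instead of returning `Err`. The crate name `pixelfmt` and the `demo` wrapper below are assumptions for illustration; the types and calls mirror the `size3x3` test above.

    use pixelfmt::{frame::ConsecutiveFrame, uyvy_to_i420::convert, PixelFormat};

    fn demo() -> Result<(), pixelfmt::ConversionError> {
        // 3x3 UYVY422 input: min_plane_dims rounds the odd width up to 4
        // pixels' worth of data, i.e. stride 8, with the final Y a don't-care.
        let uyvy_data = vec![0u8; 8 * 3];
        let uyvy_in = ConsecutiveFrame::new(PixelFormat::UYVY422, 3, 3)
            .with_storage(&uyvy_data[..]);
        // I420 output: a 3x3 Y plane plus 2x2 U and V planes.
        let mut yuv_out = ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
        convert(&uyvy_in, &mut yuv_out)?;
        Ok(())
    }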
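The chroma math above leans on one identity worth spelling out: `(w >> 1) + (w & 1)` is the round-up halving ceil(w / 2), computed without the overflow hazard of `(w + 1) / 2`. A standalone check (`ceil_half` is a hypothetical name, not one in the crate):

    /// Overflow-safe round-up halving, as used for chroma plane dimensions.
    fn ceil_half(w: usize) -> usize {
        (w >> 1) + (w & 1)
    }

    fn main() {
        assert_eq!(ceil_half(3), 2); // a 3-pixel-wide I420 frame has 2 chroma columns
        assert_eq!(ceil_half(4), 2);
        assert_eq!(ceil_half(usize::MAX), usize::MAX / 2 + 1); // (w + 1) / 2 would overflow here
    }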
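Similarly, the aliased `fallback` call for the final row of an odd-height frame works because the rounding average of a byte with itself is that byte, so passing the same row as both top and bottom passes its chroma through unchanged rather than subsampling vertically. A minimal demonstration of that invariant, using the same rounding-average expression as the patch:

    fn avg(a: u8, b: u8) -> u8 {
        ((u16::from(a) + u16::from(b) + 1) >> 1) as u8
    }

    fn main() {
        // avg(v, v) == v for every byte value, so the aliased last-row call
        // copies chroma instead of averaging two distinct rows.
        for v in 0..=255u8 {
            assert_eq!(avg(v, v), v);
        }
    }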