diff --git a/libs/Microsoft.MixedReality.WebRTC.Native/test/Microsoft.MixedReality.WebRTC.Native.Tests.vcxproj b/libs/Microsoft.MixedReality.WebRTC.Native/test/Microsoft.MixedReality.WebRTC.Native.Tests.vcxproj
index 8d02b2341..2124ed080 100644
--- a/libs/Microsoft.MixedReality.WebRTC.Native/test/Microsoft.MixedReality.WebRTC.Native.Tests.vcxproj
+++ b/libs/Microsoft.MixedReality.WebRTC.Native/test/Microsoft.MixedReality.WebRTC.Native.Tests.vcxproj
@@ -61,6 +61,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="audio_track_tests.cpp" />
+    <ClCompile Include="memory_tests.cpp" />
     <ClCompile Include="peer_connection_tests.cpp" />
     <ClCompile Include="sdp_utils_tests.cpp" />
     <ClCompile Include="pch.cpp">
diff --git a/libs/Microsoft.MixedReality.WebRTC.Native/test/memory_tests.cpp b/libs/Microsoft.MixedReality.WebRTC.Native/test/memory_tests.cpp
new file mode 100644
index 000000000..a713d8e00
--- /dev/null
+++ b/libs/Microsoft.MixedReality.WebRTC.Native/test/memory_tests.cpp
@@ -0,0 +1,172 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in the project root for license
+// information.
+
+#include "pch.h"
+
+#include "interop/interop_api.h"
+
+// Test fast path of mrsMemCpyStride() when data is packed.
+TEST(MemoryUtils, MemCpyStride_Fast) {
+  std::vector<uint8_t> s, d;
+  constexpr int kWidth = 32;
+  constexpr int kStride = kWidth;
+  constexpr int kHeight = 13;
+  s.resize(kStride * kHeight);
+  d.resize(kStride * kHeight);
+  {
+    uint8_t* src = s.data();
+    for (int j = 0; j < kHeight; ++j) {
+      for (int i = 0; i < kWidth; ++i) {
+        *src++ = (rand() & 0xFF);
+      }
+    }
+  }
+  {
+    const void* const src = s.data();
+    void* const dst = d.data();
+    mrsMemCpyStride(dst, kStride, src, kStride, kWidth, kHeight);
+    // Data is contiguous
+    ASSERT_EQ(0, memcmp(src, dst, kStride * kHeight));
+  }
+}
+
+// Test slow path of mrsMemCpyStride() with stride, without changing the
+// packing.
+TEST(MemoryUtils, MemCpyStride_Stride) {
+  std::vector<uint8_t> s, d;
+  constexpr int kWidth = 29;
+  constexpr int kStride = 32;
+  constexpr int kHeight = 13;
+  s.resize(kStride * kHeight);
+  d.resize(kStride * kHeight);
+  {
+    uint8_t* src = s.data();
+    for (int j = 0; j < kHeight; ++j) {
+      for (int i = 0; i < kWidth; ++i) {
+        *src++ = (rand() & 0xFF);
+      }
+      for (int i = kWidth; i < kStride; ++i) {
+        *src++ = 0xCF;
+      }
+    }
+  }
+  {
+    const void* const src = s.data();
+    void* const dst = d.data();
+    mrsMemCpyStride(dst, kStride, src, kStride, kWidth, kHeight);
+  }
+  {
+    const uint8_t* src = s.data();
+    const uint8_t* dst = d.data();
+    for (int j = 0; j < kHeight; ++j) {
+      // Test row
+      bool row_equal = true;
+      for (int i = 0; i < kWidth; ++i) {
+        row_equal = row_equal && (*src == *dst);
+        ++src;
+        ++dst;
+      }
+      ASSERT_TRUE(row_equal);
+      // Skip row padding
+      for (int i = kWidth; i < kStride; ++i) {
+        ++src;
+        ++dst;
+      }
+    }
+  }
+}
+
+// Test slow path of mrsMemCpyStride() with stride, expanding the one existing
+// in the source buffer.
+TEST(MemoryUtils, MemCpyStride_ExpandStride) {
+  std::vector<uint8_t> s, d;
+  constexpr int kWidth = 29;
+  constexpr int kSrcStride = 32;
+  constexpr int kDstStride = 48;
+  constexpr int kHeight = 13;
+  s.resize(kSrcStride * kHeight);
+  d.resize(kDstStride * kHeight);
+  {
+    uint8_t* src = s.data();
+    for (int j = 0; j < kHeight; ++j) {
+      for (int i = 0; i < kWidth; ++i) {
+        *src++ = (rand() & 0xFF);
+      }
+      for (int i = kWidth; i < kSrcStride; ++i) {
+        *src++ = 0xCF;
+      }
+    }
+  }
+  {
+    const void* const src = s.data();
+    void* const dst = d.data();
+    mrsMemCpyStride(dst, kDstStride, src, kSrcStride, kWidth, kHeight);
+  }
+  {
+    const uint8_t* src = s.data();
+    const uint8_t* dst = d.data();
+    for (int j = 0; j < kHeight; ++j) {
+      // Test row
+      bool row_equal = true;
+      for (int i = 0; i < kWidth; ++i) {
+        row_equal = row_equal && (*src == *dst);
+        ++src;
+        ++dst;
+      }
+      ASSERT_TRUE(row_equal);
+      // Skip row padding
+      for (int i = kWidth; i < kSrcStride; ++i) {
+        ++src;
+      }
+      for (int i = kWidth; i < kDstStride; ++i) {
+        ++dst;
+      }
+    }
+  }
+}
+
+// Test slow path of mrsMemCpyStride() with stride, packing the data on output.
+TEST(MemoryUtils, MemCpyStride_StrideToPack) {
+  std::vector<uint8_t> s, d;
+  constexpr int kWidth = 29;
+  constexpr int kSrcStride = 32;
+  constexpr int kDstStride = kWidth;
+  constexpr int kHeight = 13;
+  s.resize(kSrcStride * kHeight);
+  d.resize(kDstStride * kHeight);
+  {
+    uint8_t* src = s.data();
+    for (int j = 0; j < kHeight; ++j) {
+      for (int i = 0; i < kWidth; ++i) {
+        *src++ = (rand() & 0xFF);
+      }
+      for (int i = kWidth; i < kSrcStride; ++i) {
+        *src++ = 0xCF;
+      }
+    }
+  }
+  {
+    const void* const src = s.data();
+    void* const dst = d.data();
+    mrsMemCpyStride(dst, kDstStride, src, kSrcStride, kWidth, kHeight);
+  }
+  {
+    const uint8_t* src = s.data();
+    const uint8_t* dst = d.data();
+    for (int j = 0; j < kHeight; ++j) {
+      // Test row
+      bool row_equal = true;
+      for (int i = 0; i < kWidth; ++i) {
+        row_equal = row_equal && (*src == *dst);
+        ++src;
+        ++dst;
+      }
+      ASSERT_TRUE(row_equal);
+      // Skip row padding
+      for (int i = kWidth; i < kSrcStride; ++i) {
+        ++src;
+      }
+    }
+  }
+}
diff --git a/libs/Microsoft.MixedReality.WebRTC/VideoFrame.cs b/libs/Microsoft.MixedReality.WebRTC/VideoFrame.cs
index f116fe356..5a5aa6e6b 100644
--- a/libs/Microsoft.MixedReality.WebRTC/VideoFrame.cs
+++ b/libs/Microsoft.MixedReality.WebRTC/VideoFrame.cs
@@ -78,22 +78,25 @@ public void CopyTo(byte[] buffer)
             {
                 fixed (void* ptr = buffer)
                 {
+                    // Destination buffer is packed and contiguous
+                    ulong dstSizeYA = (ulong)width * height;
+                    ulong dstSizeUV = dstSizeYA / 4;
+                    int dstStrideYA = (int)width;
+                    int dstStrideUV = dstStrideYA / 2;
+
                     // Note : System.Buffer.MemoryCopy() essentially does the same (without stride), but gets transpiled by IL2CPP
                     // into the C++ corresponding to the IL instead of a single memcpy() call. This results in a large overhead,
                     // especially in Debug config where one can lose 5-10 FPS just because of this.
                     void* dst = ptr;
-                    ulong sizeY = (ulong)strideY * height;
-                    Utils.MemCpyStride(dst, strideY, (void*)dataY, strideY, (int)width, (int)height);
-                    dst = (void*)((ulong)dst + sizeY);
-                    ulong sizeU = (ulong)strideU * height / 2;
-                    Utils.MemCpyStride(dst, strideU, (void*)dataU, strideU, (int)width / 2, (int)height / 2);
-                    dst = (void*)((ulong)dst + sizeU);
-                    ulong sizeV = (ulong)strideV * height / 2;
-                    Utils.MemCpyStride(dst, strideV, (void*)dataV, strideV, (int)width / 2, (int)height / 2);
+                    Utils.MemCpyStride(dst, dstStrideYA, (void*)dataY, strideY, (int)width, (int)height);
+                    dst = (void*)((ulong)dst + dstSizeYA);
+                    Utils.MemCpyStride(dst, dstStrideUV, (void*)dataU, strideU, (int)width / 2, (int)height / 2);
+                    dst = (void*)((ulong)dst + dstSizeUV);
+                    Utils.MemCpyStride(dst, dstStrideUV, (void*)dataV, strideV, (int)width / 2, (int)height / 2);
                     if (dataA.ToPointer() != null)
                     {
-                        dst = (void*)((ulong)dst + sizeV);
-                        Utils.MemCpyStride(dst, strideA, (void*)dataA, strideA, (int)width, (int)height);
+                        dst = (void*)((ulong)dst + dstSizeUV);
+                        Utils.MemCpyStride(dst, dstStrideYA, (void*)dataA, strideA, (int)width, (int)height);
                     }
                 }
             }