diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py new file mode 100644 index 000000000..ce6851e30 --- /dev/null +++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py @@ -0,0 +1,271 @@ +# aie.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +import air +import air.compiler.util +from air.dialects import linalg, tensor, arith, func, memref +from air.ir import * +import air.passmanager +from air.dialects import air as airdialect +from air.compiler.util import run_transform +import sys + +with air.ir.Context() as ctx, Location.unknown(): + + ################################################ + ## Tiling + ################################################ + + air_tiled_ir_string = """ + module { + func.func @matmul_512x512_512xbf16__dispatch_0_matmul_512x512x512_bf16(%0 : memref<512x512xbf16>, %1 : memref<512x512xbf16>, %2 : memref<512x512xbf16>) { + %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : bf16 + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x4x64x64xbf16, 1 : i32> + %alloc_2 = memref.alloc() : memref<4x1x64x64xbf16, 1 : i32> + %alloc_3 = memref.alloc() : memref<4x4x16x16x4x4xbf16, 2 : i32> + %alloc_4 = memref.alloc() : memref<4x4x64x64xbf16, 1 : i32> + scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c512, %c512) step (%c256, %c256) { + %subview = memref.subview %2[%arg0, %arg1] [256, 256] [1, 1] : memref<512x512xbf16> to memref<256x256xbf16, strided<[512, 1], offset: ?>> + %subview_5 = memref.subview %0[%arg0, 0] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape = memref.expand_shape %subview_5 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_6 = memref.subview %1[0, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_8 = memref.transpose %expand_shape_7 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_8[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: 
?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.fill ins(%cst : bf16) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_23: bf16, %out: bf16): + %3 = arith.mulf %in, %in_23 : bf16 + %4 = arith.addf %out, %3 : bf16 + linalg.yield %4 : bf16 + } + scf.reduce + } + scf.for %arg2 = %c1 to %c7 step %c1 { + %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg2] + %subview_16 = memref.subview %0[%arg0, %3] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape_17 = memref.expand_shape %subview_16 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + 
air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_18[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_19 = memref.subview %1[%3, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_20 = memref.expand_shape %subview_19 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_21[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_22 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_23 = memref.expand_shape %subview_22 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_24[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_25 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_26 = memref.expand_shape %subview_25 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_27 = memref.transpose %expand_shape_26 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_27[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_28 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : 
memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_28 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_29: bf16, %out: bf16): + %4 = arith.mulf %in, %in_29 : bf16 + %5 = arith.addf %out, %4 : bf16 + linalg.yield %5 : bf16 + } + scf.reduce + } + } + %subview_9 = memref.subview %0[%arg0, 448] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape_10 = memref.expand_shape %subview_9 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_11 = memref.transpose %expand_shape_10 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_11[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_12 = memref.subview %1[448, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_13 = memref.expand_shape %subview_12 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_14[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 
4096, 4, 512, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>)
+            %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
+            linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
+            ^bb0(%in: bf16, %in_25: bf16, %out: bf16):
+              %3 = arith.mulf %in, %in_25 : bf16
+              %4 = arith.addf %out, %3 : bf16
+              linalg.yield %4 : bf16
+            }
+            %subview_23 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %transpose_24 = memref.transpose %subview_22 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>
+            air.dma_memcpy_nd (%subview_23[] [] [], %transpose_24[] [] []) : (memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>)
+            scf.reduce
+          }
+          %transpose_15 = memref.transpose %alloc_4 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x4x64x64xbf16, 1 : i32> to memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>
+          air.dma_memcpy_nd (%subview[] [] [], %transpose_15[] [] []) : (memref<256x256xbf16, strided<[512, 1], offset: ?>>, memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>)
+          scf.reduce
+        }
+        memref.dealloc %alloc_4 : memref<4x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_3 : memref<4x4x16x16x4x4xbf16, 2 : i32>
+        memref.dealloc %alloc_2 : memref<4x1x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_1 : memref<1x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
+        memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
+        return
+      }
+    }
+    """
+    air_module = Module.parse(air_tiled_ir_string)
+
+    ################################################
+    ## Binding scf.parallel to air hierarchies
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "buffer-results-to-out-params",
+                "air-linalg-to-func{link-with=mm.o}",
+                "air-par-to-herd{depth=1}",
+                "air-par-to-launch{has-air-segment=true}",
+                "air-copy-to-dma",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ###############################################
+    # Extract event dependency and optimize schedule
+    ###############################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-dependency",
+                "air-dependency-schedule-opt",
+                "air-specialize-dma-broadcast",
+                "air-dma-to-channel",
+                "canonicalize",
+                "cse",
+                "air-dependency-canonicalize",
+                "canonicalize",
+                "cse",
+                "air-isolate-async-dma-loop-nests",
+                "canonicalize",
+                "cse",
+                "air-fuse-channels",
+                "canonicalize",
+                "cse",
+                ### Scaling to 4 AIE columns
+                "func.func(air-split-l2-memref)",
+                "air-isolate-async-dma-loop-nests",
+                ###
+                "canonicalize",
+                "cse",
+                "func.func(air-loop-fusion)",
+                "air-label-scf-for-to-ping-pong",
+                "air-ping-pong-transform{keep-memref-dealloc=true}",
+                "canonicalize",
+                "cse",
+                "air-specialize-channel-wrap-and-stride",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## Place herd to segment
+    ################################################
+
+    air_async_module = Module.parse(str(air_module))
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "func.func(air-collapse-herd{max-col-size=4})",
+                "canonicalize",
+                "cse",
+                "air-place-herds{num-rows=4 num-cols=4 row-anchor=2 col-anchor=0}",
+                "canonicalize",
+                "cse",
+                "func.func(air-renumber-dma)",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR to MLIR-AIE
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "canonicalize",
+                "cse",
+                "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true use-pkt-flow-at-shim-dma=true}",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR runtime lowering
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-to-std",
+                "canonicalize",
+                "symbol-dce",
+                "func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})",
+                "func.func(air-unroll-outer-affine-loops{depth=2})",
+                "affine-expand-index-ops",
+                "airrt-to-npu",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+    with open("aie.mlir", "w") as f:
+        f.write(str(air_module))
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py
new file mode 100644
index 000000000..ca4b0b173
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py
@@ -0,0 +1,273 @@
+# aie2.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT + +import air +import air.compiler.util +from air.dialects import linalg, tensor, arith, func, memref +from air.ir import * +import air.passmanager +from air.dialects import air as airdialect +from air.compiler.util import run_transform +import sys + +with air.ir.Context() as ctx, Location.unknown(): + + ################################################ + ## Tiling + ################################################ + + air_tiled_ir_string = """ + module { + func.func @matmul_512x1024_512xbf16__dispatch_0_matmul_512x1024x512_bf16(%0 : memref<512x1024xbf16>, %1 : memref<1024x512xbf16>, %2 : memref<512x512xbf16>) { + %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + %c15 = arith.constant 15 : index + %c7 = arith.constant 7 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : bf16 + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x4x64x64xbf16, 1 : i32> + %alloc_2 = memref.alloc() : memref<4x1x64x64xbf16, 1 : i32> + %alloc_3 = memref.alloc() : memref<4x4x16x16x4x4xbf16, 2 : i32> + %alloc_4 = memref.alloc() : memref<4x4x64x64xbf16, 1 : i32> + scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c512, %c512) step (%c256, %c256) { + %subview = memref.subview %2[%arg0, %arg1] [256, 256] [1, 1] : memref<512x512xbf16> to memref<256x256xbf16, strided<[512, 1], offset: ?>> + %subview_5 = memref.subview %0[%arg0, 0] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>> + %expand_shape = memref.expand_shape %subview_5 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> + %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>) + %subview_6 = memref.subview %1[0, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_8 = memref.transpose %expand_shape_7 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_8[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> 
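+            // Repack each 64x64 L2 tile of A into 4x8 sub-tiles (the A-operand layout of the 4x8x4 bf16 matmul kernel in mm.cc).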
+ %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.fill ins(%cst : bf16) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_23: bf16, %out: bf16): + %3 = arith.mulf %in, %in_23 : bf16 + %4 = arith.addf %out, %3 : bf16 + linalg.yield %4 : bf16 + } + scf.reduce + } + scf.for %arg2 = %c1 to %c15 step %c1 { + %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg2] + %subview_16 = memref.subview %0[%arg0, %3] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>> + %expand_shape_17 = memref.expand_shape %subview_16 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_18[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>) + %subview_19 = memref.subview %1[%3, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], 
offset: ?>> + %expand_shape_20 = memref.expand_shape %subview_19 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_21[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_22 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_23 = memref.expand_shape %subview_22 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_24[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_25 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_26 = memref.expand_shape %subview_25 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_27 = memref.transpose %expand_shape_26 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_27[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_28 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_28 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_29: bf16, %out: bf16): + %4 = arith.mulf %in, %in_29 : bf16 + %5 = arith.addf %out, %4 : bf16 + 
linalg.yield %5 : bf16
+              }
+              scf.reduce
+            }
+          }
+          %subview_9 = memref.subview %0[%arg0, 960] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>>
+          %expand_shape_10 = memref.expand_shape %subview_9 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>>
+          %transpose_11 = memref.transpose %expand_shape_10 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>
+          air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_11[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>)
+          %subview_12 = memref.subview %1[960, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>>
+          %expand_shape_13 = memref.expand_shape %subview_12 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>>
+          %transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>
+          air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_14[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>)
+          scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) {
+            %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
+            %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32>
+            %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>)
+            %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32>
+            %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>)
+            %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
+            linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
+            ^bb0(%in: bf16, %in_25: bf16, %out: bf16):
+              %3 = arith.mulf %in, %in_25 : bf16
+              %4 = arith.addf %out, %3 : bf16
+              linalg.yield %4 : bf16
+            }
+            %subview_23 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %transpose_24 = memref.transpose %subview_22 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>
+            air.dma_memcpy_nd (%subview_23[] [] [], %transpose_24[] [] []) : (memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>)
+            scf.reduce
+          }
+          %transpose_15 = memref.transpose %alloc_4 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x4x64x64xbf16, 1 : i32> to memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>
+          air.dma_memcpy_nd (%subview[] [] [], %transpose_15[] [] []) : (memref<256x256xbf16, strided<[512, 1], offset: ?>>, memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>)
+          scf.reduce
+        }
+        memref.dealloc %alloc_4 : memref<4x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_3 : memref<4x4x16x16x4x4xbf16, 2 : i32>
+        memref.dealloc %alloc_2 : memref<4x1x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_1 : memref<1x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
+        memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
+        return
+      }
+    }
+    """
+    air_module = Module.parse(air_tiled_ir_string)
+
+    ################################################
+    ## Binding scf.parallel to air hierarchies
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "buffer-results-to-out-params",
+                "air-linalg-to-func{link-with=mm.o}",
+                "air-par-to-herd{depth=1}",
+                "air-par-to-launch{has-air-segment=true}",
+                "air-copy-to-dma",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ###############################################
+    # Extract event dependency and optimize schedule
+    ###############################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-dependency",
+                "air-dependency-schedule-opt",
+                "air-specialize-dma-broadcast",
+                "air-dma-to-channel",
+                "canonicalize",
+                "cse",
+                "air-dependency-canonicalize",
+                "canonicalize",
+                "cse",
+                "air-isolate-async-dma-loop-nests",
+                "canonicalize",
+                "cse",
+                "air-fuse-channels",
+                "canonicalize",
+                "cse",
+                ### Scaling to 4 AIE columns
+                "func.func(air-split-l2-memref)",
+                "air-isolate-async-dma-loop-nests",
+                ###
+                "canonicalize",
+                "cse",
+                "func.func(air-loop-fusion)",
+                "air-label-scf-for-to-ping-pong",
+                "air-ping-pong-transform{keep-memref-dealloc=true}",
+                "canonicalize",
+                "cse",
+                "air-specialize-channel-wrap-and-stride",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## Place herd to segment
+    ################################################
+
+    air_async_module = Module.parse(str(air_module))
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "func.func(air-collapse-herd{max-col-size=4})",
+                "canonicalize",
+                "cse",
+                "air-place-herds{num-rows=4 num-cols=4 row-anchor=2 col-anchor=0}",
+                "canonicalize",
+                "cse",
+                "func.func(air-renumber-dma)",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR to MLIR-AIE
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "canonicalize",
+                "cse",
+                "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true use-pkt-flow-at-shim-dma=true}",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR runtime lowering
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-to-std",
+                "canonicalize",
+                "symbol-dce",
+                "func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})",
+                "func.func(air-unroll-outer-affine-loops{depth=2})",
+                "affine-expand-index-ops",
+                "airrt-to-npu",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+    with open("aie2.mlir", "w") as f:
+        f.write(str(air_module))
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h b/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h
new file mode 100644
index 000000000..988426b6c
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h
@@ -0,0 +1,288 @@
+//===- matrix_multiplication.h --------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// This file contains common helper functions for the matrix multiplication
+// host code, such as verifying and printing matrices.
+
+#ifndef MATRIX_MULTIPLICATION_H
+#define MATRIX_MULTIPLICATION_H
+
+#include <boost/program_options.hpp>
+#include <cmath>
+
+namespace matmul_common {
+
+namespace po = boost::program_options;
+
+// --------------------------------------------------------------------------
+// Command Line Argument Handling
+// --------------------------------------------------------------------------
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+void add_default_options(po::options_description &desc) {
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output");
+}
+
+void parse_options(int argc, const char *argv[], po::options_description &desc,
+                   po::variables_map &vm) {
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      std::exit(1);
+    }
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    std::exit(1);
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+}
+
+// --------------------------------------------------------------------------
+// AIE Specifics
+// --------------------------------------------------------------------------
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// --------------------------------------------------------------------------
+// Matrix / Float / Math
+// --------------------------------------------------------------------------
+
+static inline std::int16_t random_int16_t() {
+  return (std::int16_t)(rand() % 0x10000);
+}
+
+static inline std::bfloat16_t random_bfloat16_t() {
+  // Random numbers should NOT be uniformly between 0 and 1, because that
+  // would make the matrix product AB always close to 1.
+  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
+}
+
+template <typename Tin, typename Tout>
+void matmul_naive(int M, int N, int K, const std::vector<Tin> A,
+                  const std::vector<Tin> B, std::vector<Tout> &C) {
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      Tout running_sum = 0;
+      for (int k = 0; k < K; k++) {
+        running_sum += Tout(A[row * K + k] * B[k * N + col]);
+      }
+      C[row * N + col] = Tout(running_sum);
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void matmul(int M, int N, int K, const std::vector<Tin> A,
+            const std::vector<Tin> B, std::vector<Tout> &C) {
+  // A is an MxK matrix
+  // B is a KxN matrix
+  // C is the MxN output matrix, assumed to be zeroed out
+
+  constexpr int K_block_size = 64;
+  const int n_K_blocks = K / K_block_size;
+
+  const Tin *B_origin = B.data(); /* Avoid calls to B.data() within the loop
+                                     with this const variable. B does not get
+                                     resized, so the pointer remains valid. */
+
+  const Tin *A_base = A.data();   /* Points to start of current row of A,
+                                     monotonically increasing by K. */
+  const Tin *B_base = B_origin;   /* Points to start of current column of B;
+                                     increases by 1 in each inner loop, resets
+                                     to B_origin (0) at the start of a new row
+                                     (outer loop). */
+
+  const Tin *A_ptr = A_base;
+  const Tin *B_ptr = B_base;
+  Tout *C_ptr = C.data(); /* Monotonically increasing by 1. */
+
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      A_ptr = A_base;
+      B_ptr = B_base;
+      Tout running_sum = 0;
+      for (int k = 0; k < n_K_blocks; k++) {
+        for (int i = 0; i < K_block_size; i++) {
+          running_sum += Tout(*A_ptr) * Tout(*B_ptr);
+          A_ptr += 1; // Advance to right neighbor; next value in this row
+          B_ptr += N; // Advance to bottom neighbor; next value in this column
+        }
+      }
+      *C_ptr = Tout(running_sum);
+      C_ptr += 1;
+      B_base += 1; /* Next iteration: same row of A (A_base unchanged),
+                      next column of B (B_base increases by 1) */
+    }
+    A_base += K;       // Advance to next row of A
+    B_base = B_origin; /* Next row of A means we need to restart at the first
+                          column of B. */
+  }
+}
+
+// nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
+// Original author: P-Gn
+// Source: https://stackoverflow.com/a/32334103
+bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
+                  float abs_th = FLT_MIN)
+// those defaults are arbitrary and could be removed
+{
+  assert(std::numeric_limits<float>::epsilon() <= epsilon);
+  assert(epsilon < 1.f);
+
+  if (a == b)
+    return true;
+
+  auto diff = std::abs(a - b);
+  auto norm =
+      std::min((std::abs(a) + std::abs(b)), std::numeric_limits<float>::max());
+  // or even faster: std::min(std::abs(a + b),
+  // std::numeric_limits<float>::max()); keeping this commented out until I
+  // update figures below
+  return diff < std::max(abs_th, epsilon * norm);
+}
+
+template <typename T>
+void print_matrix(const std::vector<T> matrix, int n_cols,
+                  int n_printable_rows = 10, int n_printable_cols = 10,
+                  std::ostream &ostream = std::cout,
+                  const char col_sep[] = " ", const char elide_sym[] = " ... ",
+                  int w = -1) {
+  assert(matrix.size() % n_cols == 0);
+
+  auto maxima = std::minmax_element(matrix.begin(), matrix.end());
+  T max_val = std::max(*maxima.first, std::abs(*maxima.second));
+  size_t n_digits = log10(max_val);
+  if (w == -1) {
+    w = n_digits;
+  }
+  int n_rows = matrix.size() / n_cols;
+
+  n_printable_rows = std::min(n_rows, n_printable_rows);
+  n_printable_cols = std::min(n_cols, n_printable_cols);
+
+  const bool elide_rows = n_printable_rows < n_rows;
+  const bool elide_cols = n_printable_cols < n_cols;
+
+  if (elide_rows || elide_cols) {
+    w = std::max((int)w, (int)strlen(elide_sym));
+  }
+
+  w += 3; // for decimal point and two decimal digits
+  ostream << std::fixed << std::setprecision(2);
+
+#define print_row(what)                                                        \
+  for (int col = 0; col < n_printable_cols / 2; col++) {                      \
+    ostream << std::right << std::setw(w) << (what);                          \
+    ostream << std::setw(0) << col_sep;                                       \
+  }                                                                           \
+  if (elide_cols) {                                                           \
+    ostream << std::setw(0) << elide_sym;                                     \
+  }                                                                           \
+  for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) {   \
+    ostream << std::right << std::setw(w) << (what);                          \
+    ostream << std::setw(0) << col_sep;                                       \
+  }
+
+  for (int row = 0; row < n_printable_rows / 2; row++) {
+    print_row(matrix[row * n_cols + col]);
+    ostream << std::endl;
+  }
+  if (elide_rows) {
+    print_row(elide_sym);
+    ostream << std::endl;
+  }
+  for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) {
+    print_row(matrix[row * n_cols + col]);
+    ostream << std::endl;
+  }
+
+#undef print_row
+}
+
+template <typename Tin, typename Tout>
+int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
+           std::vector<Tout> C) {
+  int errors = 0;
+  int max_printable_errors = 500;
+  const float absTol = 0.5;
+  const float relTol = 0.5;
+
+  std::vector<Tout> CRef(M * N);
+  matmul(M, N, K, A, B, CRef);
+
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      if (!nearly_equal(CRef[row * N + col], C[row * N + col], relTol,
+                        absTol)) {
+        errors++;
+        if (errors < max_printable_errors) {
+          std::cout << "Error in row " << row << ", col " << col << ". "
+                    << "Expected " << std::setw(4) << (float)CRef[row * N + col]
+                    << ", got " << std::setw(4) << (float)C[row * N + col]
+                    << "." << std::endl;
+        }
+      }
+    }
+  }
+
+  if (errors >= max_printable_errors) {
+    std::cout << "...and " << std::setw(0) << errors << " further errors."
+              << std::endl;
+  }
+  if (errors > 0) {
+    std::cout << std::endl << "Reference:" << std::endl;
+    matmul_common::print_matrix(CRef, N);
+    std::cout << std::endl << "Output:" << std::endl;
+    matmul_common::print_matrix(C, N);
+  }
+
+  return errors;
+}
+
+} // namespace matmul_common
+
+#endif
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc b/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc
new file mode 100644
index 000000000..c8639fd81
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc
@@ -0,0 +1,340 @@
+//===- mm.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#include "zero.cc"
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
+                       T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 2)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z)*MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 2)
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z)*MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A;
+          const T_in *__restrict pB1 = pB + (j)*colA * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + ((j + 1)) * colA * MMUL::size_B;
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B;
+
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C * rowA);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += rowA * MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += rowA * MMUL::size_A;
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B;
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+        }
+    }
+
+  event1();
+}
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized_2x2(const T_in *__restrict pA,
+                           const T_in *__restrict pB, T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 4)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z)*MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2)) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3)) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 4)
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z)*MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2)) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3)) * MMUL::size_A;
+
+          const T_in *__restrict pB1 = pB + (j)*MMUL::size_B;
+          const T_in *__restrict pB2 = pB + ((j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + ((j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + ((j + 3)) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B;
+
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C * rowA);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
+
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
+
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += rowA * MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += rowA * MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += rowA * MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += rowA * MMUL::size_A;
+
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
+
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+        }
+    }
+
+  event1();
+}
+
+template <unsigned m, unsigned k, unsigned n>
+void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
+                                       const bfloat16 *__restrict pB,
+                                       bfloat16 *__restrict pC) {
+  constexpr int r = 4;
+  constexpr int s = 8;
+  constexpr int t = 4;
+  static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
+  static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
+  static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
+  return matmul_vectorized_2x2<bfloat16, bfloat16, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
+}
+
+extern "C" {
+
+#define combos(X) X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4)
+
+#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,           \
+                                 mlir_type_out, r, s, t)                      \
+  void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \
+                                               ctype_out *c_out) {            \
+    matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out<     \
+        64, 64, 64>(a_in, b_in, c_out);                                       \
+  }
+
+#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,             \
+                               mlir_type_out, r, s, t)                        \
+  void linalg_fill_bf16_view1x1x16x16x4x4xbf16as2(ctype_out *c_out) {         \
+    zero_vectorized<ctype_out, 64, 64, 32>(c_out);                            \
+  }
+
+combos(matmul_vectorized_c_func) combos(zero_vectorized_c_func)
+
+} // extern "C"
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
new file mode 100644
index 000000000..a18b91ab9
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+} // extern "C"
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
new file mode 100644
index 000000000..a18b91ab9
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o mm.o
+// RUN: %python %S/aie.py
+// RUN: %python %S/aie2.py
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=base.xclbin %S/base.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --npu-insts-name=aie_run_seq.txt aie.mlir
+// RUN: aie-translate -aie-ctrlpkt-to-bin -aie-sequence-name=configure aie.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt
+// RUN: aie-opt -aie-ctrl-packet-to-dma -aie-dma-to-npu aie.mlir.prj/ctrlpkt.mlir -o ctrlpkt_dma_seq.mlir
+// RUN: aie-translate -aie-npu-instgen -aie-sequence-name=configure ctrlpkt_dma_seq.mlir -o ctrlpkt_dma_seq.txt
+// RUN: %python %S/aie2.py
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --npu-insts-name=aie2_run_seq.txt aie2.mlir
+// RUN: aie-translate -aie-ctrlpkt-to-bin -aie-sequence-name=configure aie2.mlir.prj/ctrlpkt.mlir -o aie2_ctrlpkt.txt
+// RUN: aie-opt -aie-ctrl-packet-to-dma -aie-dma-to-npu aie2.mlir.prj/ctrlpkt.mlir -o aie2_ctrlpkt_dma_seq.mlir
+// RUN: aie-translate -aie-npu-instgen -aie-sequence-name=configure aie2_ctrlpkt_dma_seq.mlir -o aie2_ctrlpkt_dma_seq.txt
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x base.xclbin -k MLIR_AIE
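+//
+// Pipeline summary: base.xclbin carries the control-packet overlay, loaded
+// once at xclbin registration; each design (aie.mlir, aie2.mlir) is then
+// lowered to a control-packet stream (ctrlpkt.txt / aie2_ctrlpkt.txt) plus
+// the NPU instruction sequences that deliver that stream and run the design.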
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp b/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp
new file mode 100644
index 000000000..faa840530
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp
@@ -0,0 +1,366 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <stdfloat>
+#include <string>
+#include <vector>
+
+#include "experimental/xrt_kernel.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "matrix_multiplication.h"
+
+constexpr int M = 512;
+constexpr int K1 = 512;
+constexpr int K2 = 1024;
+constexpr int N = 512;
+
+constexpr int A_VOLUME_1 = M * K1;
+constexpr int B_VOLUME_1 = N * K1;
+constexpr int C_VOLUME_1 = M * N;
+constexpr int A_VOLUME_2 = M * K2;
+constexpr int B_VOLUME_2 = N * K2;
+constexpr int C_VOLUME_2 = M * N;
+
+using A_DATATYPE = std::bfloat16_t;
+using B_DATATYPE = std::bfloat16_t;
+using C_DATATYPE = std::bfloat16_t;
+
+constexpr int A_SIZE_1 = (A_VOLUME_1 * sizeof(A_DATATYPE));
+constexpr int B_SIZE_1 = (B_VOLUME_1 * sizeof(B_DATATYPE));
+constexpr int C_SIZE_1 = (C_VOLUME_1 * sizeof(C_DATATYPE));
+
+constexpr int A_SIZE_2 = (A_VOLUME_2 * sizeof(A_DATATYPE));
+constexpr int B_SIZE_2 = (B_VOLUME_2 * sizeof(B_DATATYPE));
+constexpr int C_SIZE_2 = (C_VOLUME_2 * sizeof(C_DATATYPE));
+
+constexpr bool VERIFY = true;
+
+namespace po = boost::program_options;
+
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  matmul_common::add_default_options(desc);
+  matmul_common::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+
+  srand(time(NULL));
+
+  std::vector<uint32_t> instr1_v =
+      matmul_common::load_instr_sequence("aie_run_seq.txt");
+
+  std::vector<uint32_t> ctrlpkt_instr1_v =
+      matmul_common::load_instr_sequence("ctrlpkt_dma_seq.txt");
+
+  std::vector<uint32_t> ctrlPackets1 =
+      matmul_common::load_instr_sequence("ctrlpkt.txt");
+
+  std::vector<uint32_t> instr2_v =
+      matmul_common::load_instr_sequence("aie2_run_seq.txt");
+
+  std::vector<uint32_t> ctrlpkt_instr2_v =
+      matmul_common::load_instr_sequence("aie2_ctrlpkt_dma_seq.txt");
+
+  std::vector<uint32_t> ctrlPackets2 =
+      matmul_common::load_instr_sequence("aie2_ctrlpkt.txt");
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
+                                 auto name = k.get_name();
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
+                                 return name.rfind(Node, 0) == 0;
+                               });
+  auto kernelName = xkernel.get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_ctrlpkt_instr1 =
+      xrt::bo(device, ctrlpkt_instr1_v.size() * sizeof(int),
+              XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_ctrlpkt1 = xrt::bo(device, ctrlPackets1.size() * sizeof(int32_t),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_instr1 = xrt::bo(device, instr1_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a1 =
+      xrt::bo(device, A_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b1 =
+      xrt::bo(device, B_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c1 =
+      xrt::bo(device, C_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+  auto bo_ctrlpkt_instr2 =
+      xrt::bo(device, ctrlpkt_instr2_v.size() * sizeof(int),
+              XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_ctrlpkt2 = xrt::bo(device, ctrlPackets2.size() * sizeof(int32_t),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_instr2 = xrt::bo(device, instr2_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a2 =
+      xrt::bo(device, A_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b2 =
+      xrt::bo(device, B_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c2 =
+      xrt::bo(device, C_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  A_DATATYPE *bufA1 = bo_a1.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec1(A_VOLUME_1);
+  for (int i = 0; i < A_VOLUME_1; i++) {
+    AVec1[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufA1, AVec1.data(), (AVec1.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB1 = bo_b1.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec1(B_VOLUME_1);
+  for (int i = 0; i < B_VOLUME_1; i++) {
+    BVec1[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufB1, BVec1.data(), (BVec1.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC1 = bo_c1.map<C_DATATYPE *>();
+  std::vector<C_DATATYPE> CVec1(C_VOLUME_1);
+  memcpy(bufC1, CVec1.data(), (CVec1.size() * sizeof(C_DATATYPE)));
+
+  void *bufInstr1 = bo_instr1.map<void *>();
+  memcpy(bufInstr1, instr1_v.data(), instr1_v.size() * sizeof(int));
+
+  void *bufCtrlpktInstr1 = bo_ctrlpkt_instr1.map<void *>();
+  memcpy(bufCtrlpktInstr1, ctrlpkt_instr1_v.data(),
+         ctrlpkt_instr1_v.size() * sizeof(int));
+
+  void *bufctrlpkt1 = bo_ctrlpkt1.map<void *>();
+  memcpy(bufctrlpkt1, ctrlPackets1.data(), ctrlPackets1.size() * sizeof(int));
+
+  A_DATATYPE *bufA2 = bo_a2.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec2(A_VOLUME_2);
+  for (int i = 0; i < A_VOLUME_2; i++) {
+    AVec2[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufA2, AVec2.data(), (AVec2.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB2 = bo_b2.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec2(B_VOLUME_2);
+  for (int i = 0; i < B_VOLUME_2; i++) {
+    BVec2[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufB2, BVec2.data(), (BVec2.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC2 = bo_c2.map<C_DATATYPE *>();
+  std::vector<C_DATATYPE> CVec2(C_VOLUME_2);
+  memcpy(bufC2, CVec2.data(), (CVec2.size() * sizeof(C_DATATYPE)));
+
+  void *bufInstr2 = bo_instr2.map<void *>();
+  memcpy(bufInstr2, instr2_v.data(), instr2_v.size() * sizeof(int));
+
+  void *bufCtrlpktInstr2 = bo_ctrlpkt_instr2.map<void *>();
+  memcpy(bufCtrlpktInstr2, ctrlpkt_instr2_v.data(),
+         ctrlpkt_instr2_v.size() * sizeof(int));
+
+  void *bufctrlpkt2 = bo_ctrlpkt2.map<void *>();
+  memcpy(bufctrlpkt2, ctrlPackets2.data(), ctrlPackets2.size() * sizeof(int));
+
+  bo_ctrlpkt_instr1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  bo_ctrlpkt_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
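+
+  // Host-side pattern used for every buffer above: map() the BO into the
+  // host address space, memcpy the payload in, then sync(TO_DEVICE) so the
+  // NPU observes a coherent copy before any run is submitted.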
+
+  unsigned num_iter = 1;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    unsigned int opcode = 3;
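+
+    // The four runs below are chained through an xrt::runlist, which starts
+    // them in submission order: each configuration run (control packets)
+    // must retire before the design run that depends on it begins.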
+
+    // Create a runlist chaining four separate runs:
+    // two configurations and the two GEMM designs
+    xrt::runlist runlist = xrt::runlist(context);
+
+    // Run 0: configuration
+    auto run0 = xrt::run(kernel);
+    run0.set_arg(0, opcode);
+    run0.set_arg(1, bo_ctrlpkt_instr1);
+    run0.set_arg(2, ctrlpkt_instr1_v.size());
+    run0.set_arg(3, bo_ctrlpkt1);
+    run0.set_arg(4, 0);
+    run0.set_arg(5, 0);
+    run0.set_arg(6, 0);
+    run0.set_arg(7, 0);
+    // Run 1: the design
+    auto run1 = xrt::run(kernel);
+    run1.set_arg(0, opcode);
+    run1.set_arg(1, bo_instr1);
+    run1.set_arg(2, instr1_v.size());
+    run1.set_arg(3, bo_a1);
+    run1.set_arg(4, bo_b1);
+    run1.set_arg(5, bo_c1);
+    run1.set_arg(6, 0);
+    run1.set_arg(7, 0);
+
+    // Run 2: configuration
+    auto run2 = xrt::run(kernel);
+    run2.set_arg(0, opcode);
+    run2.set_arg(1, bo_ctrlpkt_instr2);
+    run2.set_arg(2, ctrlpkt_instr2_v.size());
+    run2.set_arg(3, bo_ctrlpkt2);
+    run2.set_arg(4, 0);
+    run2.set_arg(5, 0);
+    run2.set_arg(6, 0);
+    run2.set_arg(7, 0);
+    // Run 3: the design
+    auto run3 = xrt::run(kernel);
+    run3.set_arg(0, opcode);
+    run3.set_arg(1, bo_instr2);
+    run3.set_arg(2, instr2_v.size());
+    run3.set_arg(3, bo_a2);
+    run3.set_arg(4, bo_b2);
+    run3.set_arg(5, bo_c2);
+    run3.set_arg(6, 0);
+    run3.set_arg(7, 0);
+
+    // Executing and waiting on the runlist
+    runlist.add(run0);
+    runlist.add(run1);
+    runlist.add(run2);
+    runlist.add(run3);
+    runlist.execute();
+    runlist.wait();
+
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_c1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    bo_c2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    memcpy(CVec1.data(), bufC1, (CVec1.size() * sizeof(C_DATATYPE)));
+    std::vector<C_DATATYPE> CVecRef1(C_VOLUME_1);
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying against reference matmul ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      matmul_common::matmul(M, N, K1, AVec1, BVec1, CVecRef1);
+      errors = matmul_common::verify(M, N, K1, AVec1, BVec1, CVec1);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
+    }
+
+    memcpy(CVec2.data(), bufC2, (CVec2.size() * sizeof(C_DATATYPE)));
+    std::vector<C_DATATYPE> CVecRef2(C_VOLUME_2);
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying against reference matmul ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      matmul_common::matmul(M, N, K2, AVec2, BVec2, CVecRef2);
+      // Accumulate so a failure in the first GEMM is not overwritten
+      errors += matmul_common::verify(M, N, K2, AVec2, BVec2, CVec2);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
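+
+  // Note: start/stop bracket all four runs, so the reported times include
+  // the control-packet reconfiguration overhead as well as the two GEMMs.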
+
+  std::cout << std::endl
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+
+  if (VERIFY && !errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc b/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc
new file mode 100644
index 000000000..8c13b601d
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc
@@ -0,0 +1,33 @@
+//===- zero.cc ----------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ZERO_CC
+#define ZERO_CC
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Fill an M x N tile with zeros using r-lane vector stores, falling back to
+// scalar writes for any remainder
+template <typename T, int M, int N, int r>
+void zero_vectorized(T *__restrict c) {
+  const aie::vector<T, r> zeros = aie::zeros<T, r>();
+  const T *__restrict c_end = c + M * N;
+  for (; c + r < c_end; c += r) {
+    aie::store_v(c, zeros);
+  }
+  // Do a scalar write for any remainder not divisible by vector instruction
+  // size r
+  for (; c < c_end; c++) {
+    *c = 0;
+  }
+}
+
+#endif
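+
+// Example instantiation matching the extern "C" wrapper in mm.cc (the
+// 32-lane bf16 store width is an assumption of that wrapper):
+//
+//   zero_vectorized<bfloat16, 64, 64, 32>(c_out);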