diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py new file mode 100644 index 000000000..ce6851e30 --- /dev/null +++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie.py @@ -0,0 +1,271 @@ +# aie.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +import air +import air.compiler.util +from air.dialects import linalg, tensor, arith, func, memref +from air.ir import * +import air.passmanager +from air.dialects import air as airdialect +from air.compiler.util import run_transform +import sys + +with air.ir.Context() as ctx, Location.unknown(): + + ################################################ + ## Tiling + ################################################ + + air_tiled_ir_string = """ + module { + func.func @matmul_512x512_512xbf16__dispatch_0_matmul_512x512x512_bf16(%0 : memref<512x512xbf16>, %1 : memref<512x512xbf16>, %2 : memref<512x512xbf16>) { + %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : bf16 + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x4x64x64xbf16, 1 : i32> + %alloc_2 = memref.alloc() : memref<4x1x64x64xbf16, 1 : i32> + %alloc_3 = memref.alloc() : memref<4x4x16x16x4x4xbf16, 2 : i32> + %alloc_4 = memref.alloc() : memref<4x4x64x64xbf16, 1 : i32> + scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c512, %c512) step (%c256, %c256) { + %subview = memref.subview %2[%arg0, %arg1] [256, 256] [1, 1] : memref<512x512xbf16> to memref<256x256xbf16, strided<[512, 1], offset: ?>> + %subview_5 = memref.subview %0[%arg0, 0] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape = memref.expand_shape %subview_5 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_6 = memref.subview %1[0, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_8 = memref.transpose %expand_shape_7 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_8[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: 
?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.fill ins(%cst : bf16) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_23: bf16, %out: bf16): + %3 = arith.mulf %in, %in_23 : bf16 + %4 = arith.addf %out, %3 : bf16 + linalg.yield %4 : bf16 + } + scf.reduce + } + scf.for %arg2 = %c1 to %c7 step %c1 { + %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg2] + %subview_16 = memref.subview %0[%arg0, %3] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape_17 = memref.expand_shape %subview_16 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + 
air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_18[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_19 = memref.subview %1[%3, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_20 = memref.expand_shape %subview_19 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_21[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_22 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_23 = memref.expand_shape %subview_22 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_24[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_25 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_26 = memref.expand_shape %subview_25 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_27 = memref.transpose %expand_shape_26 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_27[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_28 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : 
memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_28 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_29: bf16, %out: bf16): + %4 = arith.mulf %in, %in_29 : bf16 + %5 = arith.addf %out, %4 : bf16 + linalg.yield %5 : bf16 + } + scf.reduce + } + } + %subview_9 = memref.subview %0[%arg0, 448] [256, 64] [1, 1] : memref<512x512xbf16> to memref<256x64xbf16, strided<[512, 1], offset: ?>> + %expand_shape_10 = memref.expand_shape %subview_9 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[512, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_11 = memref.transpose %expand_shape_10 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_11[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + %subview_12 = memref.subview %1[448, %arg1] [64, 256] [1, 1] : memref<512x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_13 = memref.expand_shape %subview_12 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_14[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 
4096, 4, 512, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>)
+            %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
+            linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
+            ^bb0(%in: bf16, %in_25: bf16, %out: bf16):
+              %3 = arith.mulf %in, %in_25 : bf16
+              %4 = arith.addf %out, %3 : bf16
+              linalg.yield %4 : bf16
+            }
+            %subview_23 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %transpose_24 = memref.transpose %subview_22 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>
+            air.dma_memcpy_nd (%subview_23[] [] [], %transpose_24[] [] []) : (memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>)
+            scf.reduce
+          }
+          %transpose_15 = memref.transpose %alloc_4 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x4x64x64xbf16, 1 : i32> to memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>
+          air.dma_memcpy_nd (%subview[] [] [], %transpose_15[] [] []) : (memref<256x256xbf16, strided<[512, 1], offset: ?>>, memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>)
+          scf.reduce
+        }
+        memref.dealloc %alloc_4 : memref<4x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_3 : memref<4x4x16x16x4x4xbf16, 2 : i32>
+        memref.dealloc %alloc_2 : memref<4x1x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_1 : memref<1x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
+        memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
+        return
+      }
+    }
+    """
+    air_module = Module.parse(air_tiled_ir_string)
+
+    ################################################
+    ## Binding scf.parallel to air hierarchies
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "buffer-results-to-out-params",
+                "air-linalg-to-func{link-with=mm.o}",
+                "air-par-to-herd{depth=1}",
+                "air-par-to-launch{has-air-segment=true}",
+                "air-copy-to-dma",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ###############################################
+    # Extract event dependency and optimize schedule
+    ###############################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-dependency",
+                "air-dependency-schedule-opt",
+                "air-specialize-dma-broadcast",
+                "air-dma-to-channel",
+                "canonicalize",
+                "cse",
+                "air-dependency-canonicalize",
+                "canonicalize",
+                "cse",
+                "air-isolate-async-dma-loop-nests",
+                "canonicalize",
+                "cse",
+                "air-fuse-channels",
+                "canonicalize",
+                "cse",
+                ### Scaling to 4 AIE columns
+                "func.func(air-split-l2-memref)",
+                "air-isolate-async-dma-loop-nests",
+                ###
+                "canonicalize",
+                "cse",
+                "func.func(air-loop-fusion)",
+                "air-label-scf-for-to-ping-pong",
+                "air-ping-pong-transform{keep-memref-dealloc=true}",
+                "canonicalize",
+                "cse",
+                "air-specialize-channel-wrap-and-stride",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## Place herd to segment
+    ################################################
+
+    air_async_module = Module.parse(str(air_module))
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "func.func(air-collapse-herd{max-col-size=4})",
+                "canonicalize",
+                "cse",
+                "air-place-herds{num-rows=4 num-cols=4 row-anchor=2 col-anchor=0}",
+                "canonicalize",
+                "cse",
+                "func.func(air-renumber-dma)",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR to MLIR-AIE
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "canonicalize",
+                "cse",
+                "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true use-pkt-flow-at-shim-dma=true}",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR runtime lowering
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-to-std",
+                "canonicalize",
+                "symbol-dce",
+                "func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})",
+                "func.func(air-unroll-outer-affine-loops{depth=2})",
+                "affine-expand-index-ops",
+                "airrt-to-npu",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+    with open("aie.mlir", "w") as f:
+        f.write(str(air_module))
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py
new file mode 100644
index 000000000..ca4b0b173
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/aie2.py
@@ -0,0 +1,273 @@
+# aie2.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT + +import air +import air.compiler.util +from air.dialects import linalg, tensor, arith, func, memref +from air.ir import * +import air.passmanager +from air.dialects import air as airdialect +from air.compiler.util import run_transform +import sys + +with air.ir.Context() as ctx, Location.unknown(): + + ################################################ + ## Tiling + ################################################ + + air_tiled_ir_string = """ + module { + func.func @matmul_512x1024_512xbf16__dispatch_0_matmul_512x1024x512_bf16(%0 : memref<512x1024xbf16>, %1 : memref<1024x512xbf16>, %2 : memref<512x512xbf16>) { + %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + %c15 = arith.constant 15 : index + %c7 = arith.constant 7 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : bf16 + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x4x64x64xbf16, 1 : i32> + %alloc_2 = memref.alloc() : memref<4x1x64x64xbf16, 1 : i32> + %alloc_3 = memref.alloc() : memref<4x4x16x16x4x4xbf16, 2 : i32> + %alloc_4 = memref.alloc() : memref<4x4x64x64xbf16, 1 : i32> + scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c512, %c512) step (%c256, %c256) { + %subview = memref.subview %2[%arg0, %arg1] [256, 256] [1, 1] : memref<512x512xbf16> to memref<256x256xbf16, strided<[512, 1], offset: ?>> + %subview_5 = memref.subview %0[%arg0, 0] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>> + %expand_shape = memref.expand_shape %subview_5 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> + %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>) + %subview_6 = memref.subview %1[0, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_8 = memref.transpose %expand_shape_7 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_8[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> 
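+            // Repack each 64x64 L2 tile of A into 4x8 sub-tiles (the A-operand layout of the 4x8x4 bf16 matmul kernel in mm.cc).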
+ %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.fill ins(%cst : bf16) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_23: bf16, %out: bf16): + %3 = arith.mulf %in, %in_23 : bf16 + %4 = arith.addf %out, %3 : bf16 + linalg.yield %4 : bf16 + } + scf.reduce + } + scf.for %arg2 = %c1 to %c15 step %c1 { + %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg2] + %subview_16 = memref.subview %0[%arg0, %3] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>> + %expand_shape_17 = memref.expand_shape %subview_16 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> + %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_18[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>) + %subview_19 = memref.subview %1[%3, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], 
offset: ?>> + %expand_shape_20 = memref.expand_shape %subview_19 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> + %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>> + air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_21[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>) + scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) { + %subview_22 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_23 = memref.expand_shape %subview_22 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> + %transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_24[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>) + %subview_25 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> + %expand_shape_26 = memref.expand_shape %subview_25 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> + %transpose_27 = memref.transpose %expand_shape_26 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %transpose_27[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>) + %subview_28 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> + linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_28 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) { + ^bb0(%in: bf16, %in_29: bf16, %out: bf16): + %4 = arith.mulf %in, %in_29 : bf16 + %5 = arith.addf %out, %4 : bf16 + 
linalg.yield %5 : bf16
+              }
+              scf.reduce
+            }
+          }
+          %subview_9 = memref.subview %0[%arg0, 960] [256, 64] [1, 1] : memref<512x1024xbf16> to memref<256x64xbf16, strided<[1024, 1], offset: ?>>
+          %expand_shape_10 = memref.expand_shape %subview_9 [[0, 1], [2, 3]] output_shape [4, 64, 1, 64] : memref<256x64xbf16, strided<[1024, 1], offset: ?>> into memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>>
+          %transpose_11 = memref.transpose %expand_shape_10 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x64x1x64xbf16, strided<[65536, 1024, 64, 1], offset: ?>> to memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>
+          air.dma_memcpy_nd (%alloc_2[] [] [], %transpose_11[] [] []) : (memref<4x1x64x64xbf16, 1 : i32>, memref<4x1x64x64xbf16, strided<[65536, 64, 1024, 1], offset: ?>>)
+          %subview_12 = memref.subview %1[960, %arg1] [64, 256] [1, 1] : memref<1024x512xbf16> to memref<64x256xbf16, strided<[512, 1], offset: ?>>
+          %expand_shape_13 = memref.expand_shape %subview_12 [[0, 1], [2, 3]] output_shape [1, 64, 4, 64] : memref<64x256xbf16, strided<[512, 1], offset: ?>> into memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>>
+          %transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<1x64x4x64xbf16, strided<[32768, 512, 64, 1], offset: ?>> to memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>
+          air.dma_memcpy_nd (%alloc_1[] [] [], %transpose_14[] [] []) : (memref<1x4x64x64xbf16, 1 : i32>, memref<1x4x64x64xbf16, strided<[32768, 64, 512, 1], offset: ?>>)
+          scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c4, %c4) step (%c1, %c1) {
+            %subview_16 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
+            %expand_shape_17 = memref.expand_shape %subview_16 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 16, 4, 8, 8] : memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32>
+            %transpose_18 = memref.transpose %expand_shape_17 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x16x4x8x8xbf16, strided<[4096, 4096, 256, 64, 8, 1], offset: ?>, 1 : i32> to memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc_0[] [] [], %transpose_18[] [] []) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x8x16x4x8xbf16, strided<[4096, 4096, 8, 256, 64, 1], offset: ?>, 1 : i32>)
+            %subview_19 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %expand_shape_20 = memref.expand_shape %subview_19 [[0], [1], [2, 3], [4, 5]] output_shape [1, 1, 8, 8, 16, 4] : memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> into memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32>
+            %transpose_21 = memref.transpose %expand_shape_20 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x8x16x4xbf16, strided<[16384, 4096, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>
+            air.dma_memcpy_nd (%alloc[] [] [], %transpose_21[] [] []) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, strided<[16384, 4096, 4, 512, 64, 1], offset: ?>, 1 : i32>)
+            %subview_22 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<4x4x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
+            linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], library_call = "matmul_bf16_bf16"} ins(%alloc_0, %alloc : memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<1x1x16x8x8x4xbf16, 2 : i32>) outs(%subview_22 : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
+            ^bb0(%in: bf16, %in_25: bf16, %out: bf16):
+              %3 = arith.mulf %in, %in_25 : bf16
+              %4 = arith.addf %out, %3 : bf16
+              linalg.yield %4 : bf16
+            }
+            %subview_23 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<4x4x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>
+            %transpose_24 = memref.transpose %subview_22 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x16x16x4x4xbf16, strided<[16384, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>
+            air.dma_memcpy_nd (%subview_23[] [] [], %transpose_24[] [] []) : (memref<1x1x64x64xbf16, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x16x4x16x4xbf16, strided<[16384, 4096, 16, 4, 256, 1], offset: ?>, 2 : i32>)
+            scf.reduce
+          }
+          %transpose_15 = memref.transpose %alloc_4 (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<4x4x64x64xbf16, 1 : i32> to memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>
+          air.dma_memcpy_nd (%subview[] [] [], %transpose_15[] [] []) : (memref<256x256xbf16, strided<[512, 1], offset: ?>>, memref<4x64x4x64xbf16, strided<[16384, 64, 4096, 1]>, 1 : i32>)
+          scf.reduce
+        }
+        memref.dealloc %alloc_4 : memref<4x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_3 : memref<4x4x16x16x4x4xbf16, 2 : i32>
+        memref.dealloc %alloc_2 : memref<4x1x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_1 : memref<1x4x64x64xbf16, 1 : i32>
+        memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
+        memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
+        return
+      }
+    }
+    """
+    air_module = Module.parse(air_tiled_ir_string)
+
+    ################################################
+    ## Binding scf.parallel to air hierarchies
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "buffer-results-to-out-params",
+                "air-linalg-to-func{link-with=mm.o}",
+                "air-par-to-herd{depth=1}",
+                "air-par-to-launch{has-air-segment=true}",
+                "air-copy-to-dma",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ###############################################
+    # Extract event dependency and optimize schedule
+    ###############################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-dependency",
+                "air-dependency-schedule-opt",
+                "air-specialize-dma-broadcast",
+                "air-dma-to-channel",
+                "canonicalize",
+                "cse",
+                "air-dependency-canonicalize",
+                "canonicalize",
+                "cse",
+                "air-isolate-async-dma-loop-nests",
+                "canonicalize",
+                "cse",
+                "air-fuse-channels",
+                "canonicalize",
+                "cse",
+                ### Scaling to 4 AIE columns
+                "func.func(air-split-l2-memref)",
+                "air-isolate-async-dma-loop-nests",
+                ###
+                "canonicalize",
+                "cse",
+                "func.func(air-loop-fusion)",
+                "air-label-scf-for-to-ping-pong",
+                "air-ping-pong-transform{keep-memref-dealloc=true}",
+                "canonicalize",
+                "cse",
+                "air-specialize-channel-wrap-and-stride",
+                "canonicalize",
+                "cse",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## Place herd to segment
+    ################################################
+
+    air_async_module = Module.parse(str(air_module))
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "func.func(air-collapse-herd{max-col-size=4})",
+                "canonicalize",
+                "cse",
+                "air-place-herds{num-rows=4 num-cols=4 row-anchor=2 col-anchor=0}",
+                "canonicalize",
+                "cse",
+                "func.func(air-renumber-dma)",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR to MLIR-AIE
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "canonicalize",
+                "cse",
+                "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true use-pkt-flow-at-shim-dma=true}",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR runtime lowering
+    ################################################
+
+    pipeline = (
+        "builtin.module("
+        + ",".join(
+            [
+                "air-to-std",
+                "canonicalize",
+                "symbol-dce",
+                "func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})",
+                "func.func(air-unroll-outer-affine-loops{depth=2})",
+                "affine-expand-index-ops",
+                "airrt-to-npu",
+                "canonicalize",
+            ]
+        )
+        + ")"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+    with open("aie2.mlir", "w") as f:
+        f.write(str(air_module))
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h b/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h
new file mode 100644
index 000000000..988426b6c
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/matrix_multiplication.h
@@ -0,0 +1,288 @@
+//===- matrix_multiplication.h --------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// This file contains common helper functions for the matrix multiplication
+// host code, such as verifying and printing matrices.
+
+#ifndef MATRIX_MULTIPLICATION_H
+#define MATRIX_MULTIPLICATION_H
+
+#include <boost/program_options.hpp>
+#include <cmath>
+
+namespace matmul_common {
+
+namespace po = boost::program_options;
+
+// --------------------------------------------------------------------------
+// Command Line Argument Handling
+// --------------------------------------------------------------------------
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+void add_default_options(po::options_description &desc) {
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output");
+}
+
+void parse_options(int argc, const char *argv[], po::options_description &desc,
+                   po::variables_map &vm) {
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      std::exit(1);
+    }
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    std::exit(1);
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+}
+
+// --------------------------------------------------------------------------
+// AIE Specifics
+// --------------------------------------------------------------------------
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// --------------------------------------------------------------------------
+// Matrix / Float / Math
+// --------------------------------------------------------------------------
+
+static inline std::int16_t random_int16_t() {
+  return (std::int16_t)(rand() % 0x10000);
+}
+
+static inline std::bfloat16_t random_bfloat16_t() {
+  // Random numbers should NOT be uniformly between 0 and 1, because that
+  // would make the matrix product AB always close to 1.
+  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
+}
+
+template <typename Tin, typename Tout>
+void matmul_naive(int M, int N, int K, const std::vector<Tin> A,
+                  const std::vector<Tin> B, std::vector<Tout> &C) {
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      Tout running_sum = 0;
+      for (int k = 0; k < K; k++) {
+        running_sum += Tout(A[row * K + k] * B[k * N + col]);
+      }
+      C[row * N + col] = Tout(running_sum);
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void matmul(int M, int N, int K, const std::vector<Tin> A,
+            const std::vector<Tin> B, std::vector<Tout> &C) {
+  // A is an MxK matrix
+  // B is a KxN matrix
+  // C is the MxN output matrix, assumed to be zeroed out
+
+  constexpr int K_block_size = 64;
+  const int n_K_blocks = K / K_block_size;
+
+  const Tin *B_origin = B.data(); /* Avoid calls to B.data() within the loop
+                                     with this const variable. B does not get
+                                     resized, so the pointer remains valid. */
+
+  const Tin *A_base = A.data();   /* Points to start of current row of A,
+                                     monotonically increasing by K. */
+  const Tin *B_base = B_origin;   /* Points to start of current column of B;
+                                     increases by 1 in each inner loop, resets
+                                     to B_origin (0) at the start of a new row
+                                     (outer loop). */
+
+  const Tin *A_ptr = A_base;
+  const Tin *B_ptr = B_base;
+  Tout *C_ptr = C.data(); /* Monotonically increasing by 1. */
+
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      A_ptr = A_base;
+      B_ptr = B_base;
+      Tout running_sum = 0;
+      for (int k = 0; k < n_K_blocks; k++) {
+        for (int i = 0; i < K_block_size; i++) {
+          running_sum += Tout(*A_ptr) * Tout(*B_ptr);
+          A_ptr += 1; // Advance to right neighbor; next value in this row
+          B_ptr += N; // Advance to bottom neighbor; next value in this column
+        }
+      }
+      *C_ptr = Tout(running_sum);
+      C_ptr += 1;
+      B_base += 1; /* Next iteration: same row of A (A_base unchanged),
+                      next column of B (B_base increases by 1) */
+    }
+    A_base += K;       // Advance to next row of A
+    B_base = B_origin; /* Next row of A means we need to restart at the first
+                          column of B. */
+  }
+}
+
+// nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
+// Original author: P-Gn
+// Source: https://stackoverflow.com/a/32334103
+bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
+                  float abs_th = FLT_MIN)
+// those defaults are arbitrary and could be removed
+{
+  assert(std::numeric_limits<float>::epsilon() <= epsilon);
+  assert(epsilon < 1.f);
+
+  if (a == b)
+    return true;
+
+  auto diff = std::abs(a - b);
+  auto norm =
+      std::min((std::abs(a) + std::abs(b)), std::numeric_limits<float>::max());
+  // or even faster: std::min(std::abs(a + b),
+  // std::numeric_limits<float>::max()); keeping this commented out until I
+  // update figures below
+  return diff < std::max(abs_th, epsilon * norm);
+}
+
+template <typename T>
+void print_matrix(const std::vector<T> matrix, int n_cols,
+                  int n_printable_rows = 10, int n_printable_cols = 10,
+                  std::ostream &ostream = std::cout,
+                  const char col_sep[] = " ", const char elide_sym[] = " ... ",
+                  int w = -1) {
+  assert(matrix.size() % n_cols == 0);
+
+  auto maxima = std::minmax_element(matrix.begin(), matrix.end());
+  T max_val = std::max(*maxima.first, std::abs(*maxima.second));
+  size_t n_digits = log10(max_val);
+  if (w == -1) {
+    w = n_digits;
+  }
+  int n_rows = matrix.size() / n_cols;
+
+  n_printable_rows = std::min(n_rows, n_printable_rows);
+  n_printable_cols = std::min(n_cols, n_printable_cols);
+
+  const bool elide_rows = n_printable_rows < n_rows;
+  const bool elide_cols = n_printable_cols < n_cols;
+
+  if (elide_rows || elide_cols) {
+    w = std::max((int)w, (int)strlen(elide_sym));
+  }
+
+  w += 3; // for decimal point and two decimal digits
+  ostream << std::fixed << std::setprecision(2);
+
+#define print_row(what)                                                        \
+  for (int col = 0; col < n_printable_cols / 2; col++) {                      \
+    ostream << std::right << std::setw(w) << (what);                          \
+    ostream << std::setw(0) << col_sep;                                       \
+  }                                                                           \
+  if (elide_cols) {                                                           \
+    ostream << std::setw(0) << elide_sym;                                     \
+  }                                                                           \
+  for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) {   \
+    ostream << std::right << std::setw(w) << (what);                          \
+    ostream << std::setw(0) << col_sep;                                       \
+  }
+
+  for (int row = 0; row < n_printable_rows / 2; row++) {
+    print_row(matrix[row * n_cols + col]);
+    ostream << std::endl;
+  }
+  if (elide_rows) {
+    print_row(elide_sym);
+    ostream << std::endl;
+  }
+  for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) {
+    print_row(matrix[row * n_cols + col]);
+    ostream << std::endl;
+  }
+
+#undef print_row
+}
+
+template <typename Tin, typename Tout>
+int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
+           std::vector<Tout> C) {
+  int errors = 0;
+  int max_printable_errors = 500;
+  const float absTol = 0.5;
+  const float relTol = 0.5;
+
+  std::vector<Tout> CRef(M * N);
+  matmul(M, N, K, A, B, CRef);
+
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col++) {
+      if (!nearly_equal(CRef[row * N + col], C[row * N + col], relTol,
+                        absTol)) {
+        errors++;
+        if (errors < max_printable_errors) {
+          std::cout << "Error in row " << row << ", col " << col << ". "
+                    << "Expected " << std::setw(4) << (float)CRef[row * N + col]
+                    << ", got " << std::setw(4) << (float)C[row * N + col]
+                    << "." << std::endl;
+        }
+      }
+    }
+  }
+
+  if (errors >= max_printable_errors) {
+    std::cout << "...and " << std::setw(0) << errors << " further errors."
+              << std::endl;
+  }
+  if (errors > 0) {
+    std::cout << std::endl << "Reference:" << std::endl;
+    matmul_common::print_matrix(CRef, N);
+    std::cout << std::endl << "Output:" << std::endl;
+    matmul_common::print_matrix(C, N);
+  }
+
+  return errors;
+}
+
+} // namespace matmul_common
+
+#endif
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc b/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc
new file mode 100644
index 000000000..c8639fd81
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/mm.cc
@@ -0,0 +1,340 @@
+//===- mm.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#include "zero.cc"
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
+                       T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 2)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z)*MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 2)
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z)*MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A;
+          const T_in *__restrict pB1 = pB + (j)*colA * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + ((j + 1)) * colA * MMUL::size_B;
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B;
+
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C * rowA);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += rowA * MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += rowA * MMUL::size_A;
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B;
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+        }
+    }
+
+  event1();
+}
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized_2x2(const T_in *__restrict pA,
+                           const T_in *__restrict pB, T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 4)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z)*MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2)) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3)) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 4)
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z)*MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2)) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3)) * MMUL::size_A;
+
+          const T_in *__restrict pB1 = pB + (j)*MMUL::size_B;
+          const T_in *__restrict pB2 = pB + ((j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + ((j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + ((j + 3)) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += rowA * MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B;
+
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C * rowA);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C * rowA);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
+
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
+
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += rowA * MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += rowA * MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += rowA * MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += rowA * MMUL::size_A;
+
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
+
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C * rowA;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C * rowA;
+        }
+    }
+
+  event1();
+}
+
+template <unsigned m, unsigned k, unsigned n>
+void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
+                                       const bfloat16 *__restrict pB,
+                                       bfloat16 *__restrict pC) {
+  constexpr int r = 4;
+  constexpr int s = 8;
+  constexpr int t = 4;
+  static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
+  static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
+  static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
+  return matmul_vectorized_2x2<bfloat16, bfloat16, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
+}
+
+extern "C" {
+
+#define combos(X) X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4)
+
+#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,           \
+                                 mlir_type_out, r, s, t)                      \
+  void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \
+                                               ctype_out *c_out) {            \
+    matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out<     \
+        64, 64, 64>(a_in, b_in, c_out);                                       \
+  }
+
+#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,             \
+                               mlir_type_out, r, s, t)                        \
+  void linalg_fill_bf16_view1x1x16x16x4x4xbf16as2(ctype_out *c_out) {         \
+    zero_vectorized<ctype_out, 64, 64, 32>(c_out);                            \
+  }
+
+combos(matmul_vectorized_c_func) combos(zero_vectorized_c_func)
+
+} // extern "C"
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
new file mode 100644
index 000000000..a18b91ab9
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+} // extern "C"
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
new file mode 100644
index 000000000..a18b91ab9
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o mm.o
+// RUN: %python %S/aie.py
+// RUN: %python %S/aie2.py
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=base.xclbin %S/base.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --npu-insts-name=aie_run_seq.txt aie.mlir
+// RUN: aie-translate -aie-ctrlpkt-to-bin -aie-sequence-name=configure aie.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt
+// RUN: aie-opt -aie-ctrl-packet-to-dma -aie-dma-to-npu aie.mlir.prj/ctrlpkt.mlir -o ctrlpkt_dma_seq.mlir
+// RUN: aie-translate -aie-npu-instgen -aie-sequence-name=configure ctrlpkt_dma_seq.mlir -o ctrlpkt_dma_seq.txt
+// RUN: %python %S/aie2.py
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --npu-insts-name=aie2_run_seq.txt aie2.mlir
+// RUN: aie-translate -aie-ctrlpkt-to-bin -aie-sequence-name=configure aie2.mlir.prj/ctrlpkt.mlir -o aie2_ctrlpkt.txt
+// RUN: aie-opt -aie-ctrl-packet-to-dma -aie-dma-to-npu aie2.mlir.prj/ctrlpkt.mlir -o aie2_ctrlpkt_dma_seq.mlir
+// RUN: aie-translate -aie-npu-instgen -aie-sequence-name=configure aie2_ctrlpkt_dma_seq.mlir -o aie2_ctrlpkt_dma_seq.txt
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x base.xclbin -k MLIR_AIE
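+//
+// Pipeline summary: base.xclbin carries the control-packet overlay, loaded
+// once at xclbin registration; each design (aie.mlir, aie2.mlir) is then
+// lowered to a control-packet stream (ctrlpkt.txt / aie2_ctrlpkt.txt) plus
+// the NPU instruction sequences that deliver that stream and run the design.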
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp b/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp
new file mode 100644
index 000000000..faa840530
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/test.cpp
@@ -0,0 +1,366 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <stdfloat>
+#include <string>
+#include <vector>
+
+#include "experimental/xrt_kernel.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "matrix_multiplication.h"
+
+constexpr int M = 512;
+constexpr int K1 = 512;
+constexpr int K2 = 1024;
+constexpr int N = 512;
+
+constexpr int A_VOLUME_1 = M * K1;
+constexpr int B_VOLUME_1 = N * K1;
+constexpr int C_VOLUME_1 = M * N;
+constexpr int A_VOLUME_2 = M * K2;
+constexpr int B_VOLUME_2 = N * K2;
+constexpr int C_VOLUME_2 = M * N;
+
+using A_DATATYPE = std::bfloat16_t;
+using B_DATATYPE = std::bfloat16_t;
+using C_DATATYPE = std::bfloat16_t;
+
+constexpr int A_SIZE_1 = (A_VOLUME_1 * sizeof(A_DATATYPE));
+constexpr int B_SIZE_1 = (B_VOLUME_1 * sizeof(B_DATATYPE));
+constexpr int C_SIZE_1 = (C_VOLUME_1 * sizeof(C_DATATYPE));
+
+constexpr int A_SIZE_2 = (A_VOLUME_2 * sizeof(A_DATATYPE));
+constexpr int B_SIZE_2 = (B_VOLUME_2 * sizeof(B_DATATYPE));
+constexpr int C_SIZE_2 = (C_VOLUME_2 * sizeof(C_DATATYPE));
+
+constexpr bool VERIFY = true;
+
+namespace po = boost::program_options;
+
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  matmul_common::add_default_options(desc);
+  matmul_common::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+
+  srand(time(NULL));
+
+  std::vector<uint32_t> instr1_v =
+      matmul_common::load_instr_sequence("aie_run_seq.txt");
+
+  std::vector<uint32_t> ctrlpkt_instr1_v =
+      matmul_common::load_instr_sequence("ctrlpkt_dma_seq.txt");
+
+  std::vector<uint32_t> ctrlPackets1 =
+      matmul_common::load_instr_sequence("ctrlpkt.txt");
+
+  std::vector<uint32_t> instr2_v =
+      matmul_common::load_instr_sequence("aie2_run_seq.txt");
+
+  std::vector<uint32_t> ctrlpkt_instr2_v =
+      matmul_common::load_instr_sequence("aie2_ctrlpkt_dma_seq.txt");
+
+  std::vector<uint32_t> ctrlPackets2 =
+      matmul_common::load_instr_sequence("aie2_ctrlpkt.txt");
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
+                                 auto name = k.get_name();
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
+                                 return name.rfind(Node, 0) == 0;
+                               });
+  auto kernelName = xkernel.get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_ctrlpkt_instr1 =
+      xrt::bo(device, ctrlpkt_instr1_v.size() * sizeof(int),
+              XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_ctrlpkt1 = xrt::bo(device, ctrlPackets1.size() * sizeof(int32_t),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_instr1 = xrt::bo(device, instr1_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a1 =
+      xrt::bo(device, A_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b1 =
+      xrt::bo(device, B_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c1 =
+      xrt::bo(device, C_SIZE_1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+  auto bo_ctrlpkt_instr2 =
+      xrt::bo(device, ctrlpkt_instr2_v.size() * sizeof(int),
+              XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_ctrlpkt2 = xrt::bo(device, ctrlPackets2.size() * sizeof(int32_t),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_instr2 = xrt::bo(device, instr2_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a2 =
+      xrt::bo(device, A_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b2 =
+      xrt::bo(device, B_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c2 =
+      xrt::bo(device, C_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  A_DATATYPE *bufA1 = bo_a1.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec1(A_VOLUME_1);
+  for (int i = 0; i < A_VOLUME_1; i++) {
+    AVec1[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufA1, AVec1.data(), (AVec1.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB1 = bo_b1.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec1(B_VOLUME_1);
+  for (int i = 0; i < B_VOLUME_1; i++) {
+    BVec1[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufB1, BVec1.data(), (BVec1.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC1 = bo_c1.map<C_DATATYPE *>();
+  std::vector<C_DATATYPE> CVec1(C_VOLUME_1);
+  memcpy(bufC1, CVec1.data(), (CVec1.size() * sizeof(C_DATATYPE)));
+
+  void *bufInstr1 = bo_instr1.map<void *>();
+  memcpy(bufInstr1, instr1_v.data(), instr1_v.size() * sizeof(int));
+
+  void *bufCtrlpktInstr1 = bo_ctrlpkt_instr1.map<void *>();
+  memcpy(bufCtrlpktInstr1, ctrlpkt_instr1_v.data(),
+         ctrlpkt_instr1_v.size() * sizeof(int));
+
+  void *bufctrlpkt1 = bo_ctrlpkt1.map<void *>();
+  memcpy(bufctrlpkt1, ctrlPackets1.data(), ctrlPackets1.size() * sizeof(int));
+
+  A_DATATYPE *bufA2 = bo_a2.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec2(A_VOLUME_2);
+  for (int i = 0; i < A_VOLUME_2; i++) {
+    AVec2[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufA2, AVec2.data(), (AVec2.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB2 = bo_b2.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec2(B_VOLUME_2);
+  for (int i = 0; i < B_VOLUME_2; i++) {
+    BVec2[i] = matmul_common::random_bfloat16_t();
+  }
+  memcpy(bufB2, BVec2.data(), (BVec2.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC2 = bo_c2.map<C_DATATYPE *>();
+  std::vector<C_DATATYPE> CVec2(C_VOLUME_2);
+  memcpy(bufC2, CVec2.data(), (CVec2.size() * sizeof(C_DATATYPE)));
+
+  void *bufInstr2 = bo_instr2.map<void *>();
+  memcpy(bufInstr2, instr2_v.data(), instr2_v.size() * sizeof(int));
+
+  void *bufCtrlpktInstr2 = bo_ctrlpkt_instr2.map<void *>();
+  memcpy(bufCtrlpktInstr2, ctrlpkt_instr2_v.data(),
+         ctrlpkt_instr2_v.size() * sizeof(int));
+
+  void *bufctrlpkt2 = bo_ctrlpkt2.map<void *>();
+  memcpy(bufctrlpkt2, ctrlPackets2.data(), ctrlPackets2.size() * sizeof(int));
+
+  bo_ctrlpkt_instr1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  bo_ctrlpkt_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
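+
+  // Host-side pattern used for every buffer above: map() the BO into the
+  // host address space, memcpy the payload in, then sync(TO_DEVICE) so the
+  // NPU observes a coherent copy before any run is submitted.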
+
+  unsigned num_iter = 1;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    unsigned int opcode = 3;
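+
+    // The four runs below are chained through an xrt::runlist, which starts
+    // them in submission order: each configuration run (control packets)
+    // must retire before the design run that depends on it begins.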
+
+    // Create a runlist chaining four separate runs:
+    // two configurations and the two GEMM designs
+    xrt::runlist runlist = xrt::runlist(context);
+
+    // Run 0: configuration
+    auto run0 = xrt::run(kernel);
+    run0.set_arg(0, opcode);
+    run0.set_arg(1, bo_ctrlpkt_instr1);
+    run0.set_arg(2, ctrlpkt_instr1_v.size());
+    run0.set_arg(3, bo_ctrlpkt1);
+    run0.set_arg(4, 0);
+    run0.set_arg(5, 0);
+    run0.set_arg(6, 0);
+    run0.set_arg(7, 0);
+    // Run 1: the design
+    auto run1 = xrt::run(kernel);
+    run1.set_arg(0, opcode);
+    run1.set_arg(1, bo_instr1);
+    run1.set_arg(2, instr1_v.size());
+    run1.set_arg(3, bo_a1);
+    run1.set_arg(4, bo_b1);
+    run1.set_arg(5, bo_c1);
+    run1.set_arg(6, 0);
+    run1.set_arg(7, 0);
+
+    // Run 2: configuration
+    auto run2 = xrt::run(kernel);
+    run2.set_arg(0, opcode);
+    run2.set_arg(1, bo_ctrlpkt_instr2);
+    run2.set_arg(2, ctrlpkt_instr2_v.size());
+    run2.set_arg(3, bo_ctrlpkt2);
+    run2.set_arg(4, 0);
+    run2.set_arg(5, 0);
+    run2.set_arg(6, 0);
+    run2.set_arg(7, 0);
+    // Run 3: the design
+    auto run3 = xrt::run(kernel);
+    run3.set_arg(0, opcode);
+    run3.set_arg(1, bo_instr2);
+    run3.set_arg(2, instr2_v.size());
+    run3.set_arg(3, bo_a2);
+    run3.set_arg(4, bo_b2);
+    run3.set_arg(5, bo_c2);
+    run3.set_arg(6, 0);
+    run3.set_arg(7, 0);
+
+    // Executing and waiting on the runlist
+    runlist.add(run0);
+    runlist.add(run1);
+    runlist.add(run2);
+    runlist.add(run3);
+    runlist.execute();
+    runlist.wait();
+
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_c1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    bo_c2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    memcpy(CVec1.data(), bufC1, (CVec1.size() * sizeof(C_DATATYPE)));
+    std::vector<C_DATATYPE> CVecRef1(C_VOLUME_1);
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying against reference matmul ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      matmul_common::matmul(M, N, K1, AVec1, BVec1, CVecRef1);
+      errors = matmul_common::verify(M, N, K1, AVec1, BVec1, CVec1);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
+    }
+
+    memcpy(CVec2.data(), bufC2, (CVec2.size() * sizeof(C_DATATYPE)));
+    std::vector<C_DATATYPE> CVecRef2(C_VOLUME_2);
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying against reference matmul ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      matmul_common::matmul(M, N, K2, AVec2, BVec2, CVecRef2);
+      // Accumulate so a failure in the first GEMM is not overwritten
+      errors += matmul_common::verify(M, N, K2, AVec2, BVec2, CVec2);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
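+
+  // Note: start/stop bracket all four runs, so the reported times include
+  // the control-packet reconfiguration overhead as well as the two GEMMs.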
+
+  std::cout << std::endl
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+
+  if (VERIFY && !errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc b/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc
new file mode 100644
index 000000000..8c13b601d
--- /dev/null
+++ b/test/xrt/24_ctrlpkt_config_2gemms_4x4/zero.cc
@@ -0,0 +1,33 @@
+//===- zero.cc ----------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ZERO_CC
+#define ZERO_CC
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Fill an M x N tile with zeros using r-lane vector stores, falling back to
+// scalar writes for any remainder
+template <typename T, int M, int N, int r>
+void zero_vectorized(T *__restrict c) {
+  const aie::vector<T, r> zeros = aie::zeros<T, r>();
+  const T *__restrict c_end = c + M * N;
+  for (; c + r < c_end; c += r) {
+    aie::store_v(c, zeros);
+  }
+  // Do a scalar write for any remainder not divisible by vector instruction
+  // size r
+  for (; c < c_end; c++) {
+    *c = 0;
+  }
+}
+
+#endif
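+
+// Example instantiation matching the extern "C" wrapper in mm.cc (the
+// 32-lane bf16 store width is an assumption of that wrapper):
+//
+//   zero_vectorized<bfloat16, 64, 64, 32>(c_out);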