From e192dba224c673671ae70f73842fc693ca279a45 Mon Sep 17 00:00:00 2001
From: Ognjen Plavsic <130548569+oplavsic@users.noreply.github.com>
Date: Tue, 10 Sep 2024 00:14:26 +0200
Subject: [PATCH] [AMD] Hoist Q out of the loop for FA optimization (#4666)

Move writing to LDS and reading from LDS right after the loading of a
tensor from global memory. This PR does the reordering by considering
two possible patterns, depending on whether writing to LDS is done
using an optional local_alloc argument or a local_store instruction:
1) load -> local_alloc -> local_store -> local_load,
2) load -> local_alloc -> local_load.

---------

Co-authored-by: Ognjen Plavsic
Co-authored-by: Lei Zhang
---
 .../amd/amd-reorder-instructions.mlir | 25 +++++++--
 .../ReorderInstructions.cpp           | 53 +++++++++++++++++++
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir
index 4e0139cd4dda..d680c08c1852 100644
--- a/test/TritonGPU/amd/amd-reorder-instructions.mlir
+++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir
@@ -1,18 +1,35 @@
 // RUN: triton-opt %s -split-input-file -tritonamdgpu-reorder-instructions | FileCheck %s
 
-// Check that we order load, local_alloc and local_load one after another. This is useful
+// Check that we order load, local_alloc, local_store (optional) and local_load one after another. This is useful
 // for making sure that Q tensor in FA is hoisted out of the main loop and kept in registers
 // throughout the computation.
 // CHECK-LABEL: order_load_alloc_local_load
 // CHECK: %[[LOAD:.+]] = tt.load
-// CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc
-// CHECK: triton_gpu.local_store %[[LOAD]], %[[ALLOC]]
-// CHECK: triton_gpu.local_load %[[ALLOC]]
+// CHECK-NEXT: %[[ALLOC:.+]] = triton_gpu.local_alloc %[[LOAD]]
+// CHECK-NEXT: triton_gpu.local_load %[[ALLOC]]
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}>
 #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>
 module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
   tt.func public @order_load_alloc_local_load(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked>) attributes {noinline = false} {
+    %9 = tt.load %arg0 : tensor<32x32x!tt.ptr<f32>, #blocked>
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
+    %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared>
+    %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+    %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma>
+    %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked>
+    tt.store %arg0, %13 : tensor<32x32x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+// CHECK-LABEL: order_load_alloc_local_load_local_store
+// CHECK: %[[LOAD:.+]] = tt.load
+// CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc
+// CHECK: triton_gpu.local_store %[[LOAD]], %[[ALLOC]]
+// CHECK: triton_gpu.local_load %[[ALLOC]]
+module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  tt.func public @order_load_alloc_local_load_local_store(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked>) attributes {noinline = false} {
     %9 = tt.load %arg0 : tensor<32x32x!tt.ptr<f32>, #blocked>
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
     %10 = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #shared, mutable>
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index 0ee89540171f..c46b76ed48e2 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -14,6 +14,7 @@
 using namespace mlir;
 
 namespace ttg = mlir::triton::gpu;
+namespace tt = mlir::triton;
 
 static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
   if (isa<ttg::LocalLoadOp>(op))
@@ -97,6 +98,58 @@ class TritonAMDGPUReorderInstructionsPass
       kv.first->moveBefore(kv.second);
     opToMove.clear();
 
+    // Move writing to LDS and reading from LDS right after the loading of a
+    // tensor from global memory. There are 2 possible patterns depending on
+    // whether writing to LDS is done using an optional local_alloc argument or
+    // a local_store instruction:
+    //
+    // 1) %1 = load %ptr
+    //    %2 = local_alloc %1
+    //    %3 = local_load %2
+    //
+    // 2) %1 = load %ptr
+    //    %2 = local_alloc
+    //    %3 = local_store %1, %2
+    //    %4 = local_load %2
+    m.walk([&](ttg::LocalLoadOp localLoad) {
+      auto localAlloc = localLoad.getSrc().getDefiningOp<ttg::LocalAllocOp>();
+      if (!localAlloc)
+        return;
+
+      // Case when localAlloc has operands
+      if (localAlloc->getNumOperands() == 1) {
+        if (!localAlloc->hasOneUse())
+          return;
+        auto loadOp = localAlloc->getOperand(0).getDefiningOp<tt::LoadOp>();
+        if (!loadOp)
+          return;
+        localAlloc->moveAfter(loadOp);
+        localLoad->moveAfter(localAlloc);
+        return;
+      }
+
+      // Case when localAlloc has no operands
+      assert(localAlloc->getNumOperands() < 1);
+      auto allocVal = localAlloc->getResult(0);
+
+      // Check if the localAlloc has exactly two uses (localStore and localLoad)
+      int numUses = std::distance(allocVal.use_begin(), allocVal.use_end());
+      if (numUses != 2)
+        return;
+
+      // localStore comes before localLoad in block.
+      Operation *localStore = getFirstUse(localAlloc);
+      if (!isa<ttg::LocalStoreOp>(localStore))
+        return;
+
+      auto loadOp = localStore->getOperand(0).getDefiningOp<tt::LoadOp>();
+      if (!loadOp)
+        return;
+      localAlloc->moveAfter(loadOp);
+      localStore->moveAfter(localAlloc);
+      localLoad->moveAfter(localStore);
+    });
+
     // Sink conversion after the last dealloc but before the first use ancestor
     // in its block. This helps to avoid unnecessary shared memory allocation.
     m.walk([&](triton::gpu::ConvertLayoutOp op) {
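
A note on the helper the new walk depends on: getFirstUse() is called by the
pattern-2 branch but is not part of this hunk, so it already exists elsewhere
in ReorderInstructions.cpp or its includes. For the reordering to be sound it
only needs to return the user of an op that appears earliest in that op's
block; combined with the numUses == 2 check, that guarantees the mutable
local_alloc is consumed by exactly one local_store followed by one local_load
before anything is moved. Below is a minimal sketch of a helper with that
behavior, assuming only standard MLIR APIs (getUsers, findAncestorOpInBlock,
isBeforeInBlock); it illustrates the required contract and is not the
repository's actual implementation.

#include <algorithm>
#include <vector>

#include "mlir/IR/Block.h"
#include "mlir/IR/Operation.h"

// Sketch: return the user of `op` that comes first in op's block, or nullptr
// if there are no users in that block.
static mlir::Operation *getFirstUse(mlir::Operation *op) {
  std::vector<mlir::Operation *> users;
  // Compare users at the level of op's own block, even when a user is nested
  // inside another region.
  for (mlir::Operation *user : op->getUsers())
    if (mlir::Operation *ancestor =
            op->getBlock()->findAncestorOpInBlock(*user))
      users.push_back(ancestor);
  if (users.empty())
    return nullptr;
  // Pick the block-earliest user; for the mutable local_alloc pattern this is
  // expected to be the local_store, which precedes the local_load.
  return *std::min_element(users.begin(), users.end(),
                           [](mlir::Operation *a, mlir::Operation *b) {
                             return a->isBeforeInBlock(b);
                           });
}

If the earliest user is not a local_store, the isa<ttg::LocalStoreOp> guard in
the walk bails out, so IR that matches neither pattern is left untouched.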