From e192dba224c673671ae70f73842fc693ca279a45 Mon Sep 17 00:00:00 2001
From: Ognjen Plavsic <130548569+oplavsic@users.noreply.github.com>
Date: Tue, 10 Sep 2024 00:14:26 +0200
Subject: [PATCH] [AMD] Hoist Q out of the loop for FA optimization (#4666)

Move writing to LDS and reading from LDS right after the loading of a
tensor from global memory. This PR does the reordering by considering
two possible patterns, depending on whether writing to LDS is done
using an optional local_alloc argument or a local_store instruction:
1) load -> local_alloc -> local_store -> local_load,
2) load -> local_alloc -> local_load.

---------

Co-authored-by: Ognjen Plavsic
Co-authored-by: Lei Zhang
---
 .../amd/amd-reorder-instructions.mlir | 25 +++++++--
 .../ReorderInstructions.cpp           | 53 +++++++++++++++++++
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir
index 4e0139cd4dda..d680c08c1852 100644
--- a/test/TritonGPU/amd/amd-reorder-instructions.mlir
+++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir
@@ -1,18 +1,35 @@
 // RUN: triton-opt %s -split-input-file -tritonamdgpu-reorder-instructions | FileCheck %s
 
-// Check that we order load, local_alloc and local_load one after another. This is useful
+// Check that we order load, local_alloc, local_store (optional) and local_load one after another. This is useful
 // for making sure that Q tensor in FA is hoisted out of the main loop and kept in registers
 // throughout the computation.
 // CHECK-LABEL: order_load_alloc_local_load
 // CHECK: %[[LOAD:.+]] = tt.load
-// CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc
-// CHECK: triton_gpu.local_store %[[LOAD]], %[[ALLOC]]
-// CHECK: triton_gpu.local_load %[[ALLOC]]
+// CHECK-NEXT: %[[ALLOC:.+]] = triton_gpu.local_alloc %[[LOAD]]
+// CHECK-NEXT: triton_gpu.local_load %[[ALLOC]]
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}>
 #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>
 module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
   tt.func public @order_load_alloc_local_load(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked>) attributes {noinline = false} {
+    %9 = tt.load %arg0 : tensor<32x32x!tt.ptr<f32>, #blocked>
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
+    %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared>
+    %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+    %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma>
+    %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked>
+    tt.store %arg0, %13 : tensor<32x32x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+// CHECK-LABEL: order_load_alloc_local_load_local_store
+// CHECK: %[[LOAD:.+]] = tt.load
+// CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc
+// CHECK: triton_gpu.local_store %[[LOAD]], %[[ALLOC]]
+// CHECK: triton_gpu.local_load %[[ALLOC]]
+module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  tt.func public @order_load_alloc_local_load_local_store(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked>) attributes {noinline = false} {
     %9 = tt.load %arg0 : tensor<32x32x!tt.ptr<f32>, #blocked>
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
     %10 = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #shared, mutable>
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index 0ee89540171f..c46b76ed48e2 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -14,6 +14,7 @@
 using namespace mlir;
 
 namespace ttg = mlir::triton::gpu;
+namespace tt = mlir::triton;
 
 static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
   if (isa<ttg::LocalLoadOp>(op))
@@ -97,6 +98,58 @@ class TritonAMDGPUReorderInstructionsPass
       kv.first->moveBefore(kv.second);
     opToMove.clear();
 
+    // Move writing to LDS and reading from LDS right after the loading of a
+    // tensor from global memory. There are 2 possible patterns depending on
+    // whether writing to LDS is done using an optional local_alloc argument or
+    // a local_store instruction:
+    //
+    // 1) %1 = load %ptr
+    //    %2 = local_alloc %1
+    //    %3 = local_load %2
+    //
+    // 2) %1 = load %ptr
+    //    %2 = local_alloc
+    //    %3 = local_store %1, %2
+    //    %4 = local_load %2
+    m.walk([&](ttg::LocalLoadOp localLoad) {
+      auto localAlloc = localLoad.getSrc().getDefiningOp<ttg::LocalAllocOp>();
+      if (!localAlloc)
+        return;
+
+      // Case when localAlloc has operands
+      if (localAlloc->getNumOperands() == 1) {
+        if (!localAlloc->hasOneUse())
+          return;
+        auto loadOp = localAlloc->getOperand(0).getDefiningOp<tt::LoadOp>();
+        if (!loadOp)
+          return;
+        localAlloc->moveAfter(loadOp);
+        localLoad->moveAfter(localAlloc);
+        return;
+      }
+
+      // Case when localAlloc has no operands
+      assert(localAlloc->getNumOperands() < 1);
+      auto allocVal = localAlloc->getResult(0);
+
+      // Check if the localAlloc has exactly two uses (localStore and localLoad)
+      int numUses = std::distance(allocVal.use_begin(), allocVal.use_end());
+      if (numUses != 2)
+        return;
+
+      // localStore comes before localLoad in block.
+      Operation *localStore = getFirstUse(localAlloc);
+      if (!isa<ttg::LocalStoreOp>(localStore))
+        return;
+
+      auto loadOp = localStore->getOperand(0).getDefiningOp<tt::LoadOp>();
+      if (!loadOp)
+        return;
+      localAlloc->moveAfter(loadOp);
+      localStore->moveAfter(localAlloc);
+      localLoad->moveAfter(localStore);
+    });
+
     // Sink conversion after the last dealloc but before the first use ancestor
     // in its block. This helps to avoid unnecessary shared memory allocation.
     m.walk([&](triton::gpu::ConvertLayoutOp op) {
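
A note on the helper the new walk depends on: getFirstUse() is called by the
pattern-2 branch but is not part of this hunk, so it already exists elsewhere
in ReorderInstructions.cpp or its includes. For the reordering to be sound it
only needs to return the user of an op that appears earliest in that op's
block; combined with the numUses == 2 check, that guarantees the mutable
local_alloc is consumed by exactly one local_store followed by one local_load
before anything is moved. Below is a minimal sketch of a helper with that
behavior, assuming only standard MLIR APIs (getUsers, findAncestorOpInBlock,
isBeforeInBlock); it illustrates the required contract and is not the
repository's actual implementation.

#include <algorithm>
#include <vector>

#include "mlir/IR/Block.h"
#include "mlir/IR/Operation.h"

// Sketch: return the user of `op` that comes first in op's block, or nullptr
// if there are no users in that block.
static mlir::Operation *getFirstUse(mlir::Operation *op) {
  std::vector<mlir::Operation *> users;
  // Compare users at the level of op's own block, even when a user is nested
  // inside another region.
  for (mlir::Operation *user : op->getUsers())
    if (mlir::Operation *ancestor =
            op->getBlock()->findAncestorOpInBlock(*user))
      users.push_back(ancestor);
  if (users.empty())
    return nullptr;
  // Pick the block-earliest user; for the mutable local_alloc pattern this is
  // expected to be the local_store, which precedes the local_load.
  return *std::min_element(users.begin(), users.end(),
                           [](mlir::Operation *a, mlir::Operation *b) {
                             return a->isBeforeInBlock(b);
                           });
}

If the earliest user is not a local_store, the isa<ttg::LocalStoreOp> guard in
the walk bails out, so IR that matches neither pattern is left untouched.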