[AMD] Add support for gpu.barrier in pipelining epilogue (#4740)

When loop pipelining, do not predicate gpu.barrier. It should be unconditional.
triton-lang · Sep 18, 2024 · 15734f6 · 15734f6
1 parent 9df25c5
commit 15734f6
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 0 deletions.
diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir
@@ -159,3 +159,42 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
     tt.return
   }
 } // end module
+
+// -----
+
+// CHECK-LABEL: tt.func public @add_barrier_kernel
+// CHECK:  tt.load
+// CHECK:  scf.for
+// CHECK:    gpu.barrier
+// CHECK:    tt.store
+// CHECK:    tt.load
+// CHECK:    scf.yield
+// CHECK:  gpu.barrier
+// CHECK:  tt.store
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
+  tt.func public @add_barrier_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cval_f32 = arith.constant dense<0.3> : tensor<1024xf32, #blocked>
+    %c1016800_i32 = arith.constant 1016800 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %4 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    %6 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    scf.for %arg4 = %c0_i32 to %arg3 step %c1024_i32  : i32 {
+      %7 = arith.addi %1, %arg4 : i32
+      %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked>
+      %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked>
+      %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %12 = tt.load %11 : tensor<1024x!tt.ptr<f32>, #blocked>
+      gpu.barrier
+      %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %15 = arith.addf %12, %cval_f32 : tensor<1024xf32, #blocked>
+      tt.store %16, %15 : tensor<1024x!tt.ptr<f32>, #blocked>
+    } {tt.num_stages = 2 : i32}
+    tt.return
+  }
+} // end module
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp
@@ -558,6 +558,8 @@ static Operation *streamPredication(RewriterBase &rewriter, Operation *op,
     dotOp->moveBefore(yield);
     ifOp.getElseBodyBuilder().create<scf::YieldOp>(loc, dotOp.getC());
     return ifOp;
+  } else if (isa<gpu::BarrierOp>(op)) {
+    return op;
   }
   return tt::predicateOp(rewriter, op, pred);
 }