From d2e645e04037267c22636a599ed168112599859a Mon Sep 17 00:00:00 2001 From: SJW Date: Tue, 10 Sep 2024 20:18:51 +0000 Subject: [PATCH] * clarified the comments --- .../amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 3ee667384f8d..b2c9158e5b43 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -545,7 +545,10 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, static Operation *streamPredication(RewriterBase &rewriter, Operation *op, Value pred) { - // Predicate dot so select will be removed (reduces register pressure) + // The epilogue peeling generates a select for the stage output. This causes + // too much register pressure with the loop result and the epilogue-dot in + // regs for the select. Conditionally executing the dot will allow the backend + // to optimize the select away as redundant. if (auto dotOp = dyn_cast(op)) { auto loc = dotOp->getLoc(); auto ifOp = rewriter.create(loc, dotOp.getResult().getType(), pred, true);