Skip to content

Commit

Permalink
[BACKEND] Improve perf of tensormap_fenceproxy_acquire (#4720)
Browse files Browse the repository at this point in the history
The fence turns out to be fairly expensive, so it is cheaper to perform
the fence on a single warp and then use a barrier to synchronize the
remaining threads.
  • Loading branch information
peterbell10 authored Sep 12, 2024
1 parent c238af8 commit df26ec6
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
#include "PatternTritonGPUOpToLLVM.h"
#include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Types.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

using namespace mlir;
using namespace mlir::triton;
Expand Down Expand Up @@ -195,12 +194,20 @@ struct ExperimentalTensormapFenceproxyAcquireOpConversion
auto *sizeOpr = ptxBuilder.newConstantOperand(TMA_SIZE_BYTES);

// Define the instruction opcode
constexpr int kWarpSize = 32;
Value threadId = getThreadId(rewriter, loc);
Value pred = icmp_slt(threadId, i32_val(kWarpSize));
auto &fence =
*ptxBuilder.create<>("fence.proxy.tensormap::generic.acquire.gpu");
fence(descAddrOpr, sizeOpr);
fence(descAddrOpr, sizeOpr).predicate(pred);

ptxBuilder.launch(rewriter, loc, getVoidType());

// We run the fence on a single warp, then use a barrier to synchronize the
// rest. This ends up being faster than running the fence on each warp.
// TODO: Ideally we only emit one barrier after all fences are issued
rewriter.create<NVVM::Barrier0Op>(loc);

rewriter.eraseOp(op);
return success();
}
Expand Down

0 comments on commit df26ec6

Please sign in to comment.