forked from triton-lang/triton
[FA fwd D=128] Reduce LDS usage in epilogue #340
Merged
9 commits, all by zhanglx13:
8f8b6f1  rebase onto improve_fwd_fa
eff2b83  Fixed a leftover from rebase
2e22055  rebase onto improve_fa_fwd
cb381e5  Reduce tuning space
379374e  Disable bwd with D=128
a1bea58  Add test for d=128
169d6d5  Fix an issue with get_best_config when there is only one config
c9b6b3b  Added better configs for d=128
8348b79  Fix typos
@@ -64,6 +64,10 @@ static void addWSNamedAttrs(Operation *op,
    op->setAttr(attr.getName(), attr.getValue());
}

#ifdef USE_ROCM
constexpr int LDSSize = 65536;
constexpr int kPtrBitWidth = 64;
#endif
class TritonLLVMFunctionConversionTarget : public ConversionTarget {
public:
  explicit TritonLLVMFunctionConversionTarget(MLIRContext &ctx, Target target)

@@ -410,6 +414,7 @@ struct ConvertTritonGPUToLLVM
    decomposeMmaToDotOperand(mod, numWarps, threadsPerWarp, numCTAs);
#ifdef USE_ROCM
    decomposeMfmaToDotOperand(mod, numWarps, threadsPerWarp, numCTAs);
    reduceCvtOpLDSUsage(mod);
#endif
    decomposeBlockedToDotOperand(mod);
    decomposeInsertSliceAsyncOp(mod);

@@ -710,6 +715,151 @@ struct ConvertTritonGPUToLLVM
      }
    });
  }

  int getCvtOpLDSUsage(triton::gpu::ConvertLayoutOp &cvtOp) const {
    unsigned inVec = 0;
    unsigned outVec = 0;
    auto smemShape = getScratchConfigForCvtLayout(cvtOp, inVec, outVec);
    unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1,
                                     std::multiplies{});
    auto srcType = cvtOp.getOperand().getType().cast<RankedTensorType>();
    auto bytes =
        srcType.getElementType().isa<triton::PointerType>()
            ? elems * kPtrBitWidth / 8
            : elems * std::max<int>(8, srcType.getElementTypeBitWidth()) / 8;

    return bytes;
  }

  bool isPowerOfTwo(unsigned x) const { return x && (x & (x - 1)) == 0; }

  std::vector<std::pair<int, int>> factorizePowerOf2(int n) const {
    assert(isPowerOfTwo(n));
    int x = log2(n);
    std::vector<std::pair<int, int>> pairs;

    for (int i = 0; i <= x / 2; ++i) {
      int j = x - i;
      pairs.push_back({pow(2, i), pow(2, j)});
      pairs.push_back({pow(2, j), pow(2, i)});
    }

    return pairs;
  }

  std::pair<triton::gpu::ConvertLayoutOp, triton::gpu::ConvertLayoutOp>
  createNewConvertOps(ModuleOp &mod, OpBuilder &builder,
                      triton::gpu::ConvertLayoutOp &cvtOp,
                      std::pair<unsigned, unsigned> warpsPerCta) const {
    unsigned warpsPerCtaX = warpsPerCta.first;
    unsigned warpsPerCtaY = warpsPerCta.second;
    auto srcType = cvtOp.getOperand().getType().cast<RankedTensorType>();
    auto dstType = cvtOp.getType().cast<RankedTensorType>();

    auto srcMfma =
        srcType.getEncoding().dyn_cast<triton::gpu::MfmaEncodingAttr>();
    auto newMfmaEnc = triton::gpu::MfmaEncodingAttr::get(
        mod.getContext(), srcMfma.getNonKDim(), {warpsPerCtaX, warpsPerCtaY},
        srcMfma.getIsTransposed(), srcMfma.getCTALayout());

    auto newDstType = RankedTensorType::get(
        dstType.getShape(), dstType.getElementType(), dstType.getEncoding());
    auto newSrcType = RankedTensorType::get(
        srcType.getShape(), srcType.getElementType(), newMfmaEnc);

    auto tmpCvt = builder.create<triton::gpu::ConvertLayoutOp>(
        cvtOp.getLoc(), newSrcType, cvtOp.getOperand());
    auto newEpilogueCvt = builder.create<triton::gpu::ConvertLayoutOp>(
        cvtOp.getLoc(), newDstType, tmpCvt);

    return std::make_pair(tmpCvt, newEpilogueCvt);
  }

  // Try to reduce LDS usage of cvt(mfma->blocked) op by changing the shape of
  // WarpsPerCta attribute in mfma layout. The implicit LDS usage of
  // cvt(mfma->blocked) op depends on the number of warps per CTA that mfma
  // layout uses along x dimension and block layout uses across y dimension.
  //
  // clang-format off
  //
  // LDS usage of this op is roughly calculated as:
  // LDS_USAGE = getShapePerCTA(mfma_layout)[0] * getShapePerCTA(blocked_layout)[1] * sizeof(data_type)
  // LDS_USAGE = warpsPerCTA(mfma_layout)[0] * warpsPerCta(blocked_layout)[1] * C,
  // where C = 32 * sizePerWarp(blocked_layout)[1] * threadsPerWarp(blocked_layout)[1] * sizeof(data_type)
  //
  // clang-format on
  //
  // When LDS_USAGE exceeds the size of LDS, try to lower LDS usage by
  // decomposing cvt(mfma->blocked) op into 2 conversions: cvt(mfma->mfma_tmp)
  // and cvt(mfma_tmp->blocked), where mfma_tmp has WarpsPerCta attribute that
  // minimizes uses of LDS for these conversions.
  void reduceCvtOpLDSUsage(ModuleOp mod) const {
    mod.walk([&](triton::gpu::ConvertLayoutOp cvtOp) -> void {
      OpBuilder builder(cvtOp);

      auto srcType = cvtOp.getOperand().getType().cast<RankedTensorType>();
      auto dstType = cvtOp.getType().cast<RankedTensorType>();

      auto srcMfma =
          srcType.getEncoding().dyn_cast<triton::gpu::MfmaEncodingAttr>();
      auto dstBlocked =
          dstType.getEncoding().dyn_cast<triton::gpu::BlockedEncodingAttr>();

      if (!srcMfma || !dstBlocked) {
        return;
      }

      auto currLDSUsage = getCvtOpLDSUsage(cvtOp);
      if (currLDSUsage <= LDSSize) {
        return;
      }

      unsigned numWarps =
          srcMfma.getWarpsPerCTA()[0] * srcMfma.getWarpsPerCTA()[1];

      triton::gpu::ConvertLayoutOp tmpCvt;
      triton::gpu::ConvertLayoutOp newEpilogueCvt;

      // Find all possible shapes of WarpsPerCTA by finding all possible
      // factorizations of numWarps. Pick shape for which both conversions in
      // decomposition use LDS less than LDSSize and for which sum of LDS usage
      // is minimal. If no such shape exists, do not decompose.
      unsigned minLDSUsage = 2 * LDSSize;
      int minIdx = -1;
      auto factorizedNumWarps = factorizePowerOf2(numWarps);

      for (int i = 0; i < factorizedNumWarps.size(); i++) {
        auto warpsPerCTAPair = factorizedNumWarps[i];
        std::tie(tmpCvt, newEpilogueCvt) =
            createNewConvertOps(mod, builder, cvtOp, warpsPerCTAPair);

        int tmpCvtLDS = getCvtOpLDSUsage(tmpCvt);
        int newCvtLDS = getCvtOpLDSUsage(newEpilogueCvt);
        if (tmpCvtLDS <= LDSSize && newCvtLDS <= LDSSize) {
          int LDSUsage = tmpCvtLDS + newCvtLDS;
          if (LDSUsage < minLDSUsage) {
            minLDSUsage = LDSUsage;
            minIdx = i;
          }
        }
        newEpilogueCvt.erase();
        tmpCvt.erase();
      }

      if (minIdx == -1) {
        return;
      }

      assert(minIdx >= 0 && minIdx < factorizedNumWarps.size());
      auto warpsPerCTAPair = factorizedNumWarps[minIdx];
      std::tie(tmpCvt, newEpilogueCvt) =
          createNewConvertOps(mod, builder, cvtOp, warpsPerCTAPair);

      cvtOp.replaceAllUsesWith(newEpilogueCvt.getResult());
      cvtOp.erase();
    });
  }

#endif

  void decomposeBlockedToDotOperand(ModuleOp mod) const {
Review comments:

- On `int LDSUsage = tmpCvtLDS + newCvtLDS;` in reduceCvtOpLDSUsage: "Does lifetimes of scratch buffers of …"
- "Are these lines redundant?"
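
To make the search in reduceCvtOpLDSUsage concrete, here is a minimal standalone sketch (not part of the PR) that re-implements the factorization enumeration and the min-LDS search with plain integers. The cost model estimateLDSUsage and all the numbers in main are illustrative assumptions derived from the rough formula in the patch comment, not values the pass actually computes; the real pass measures each candidate by materializing the two ConvertLayoutOps and querying their scratch configs via getCvtOpLDSUsage.

// Build with: g++ -std=c++17 sketch.cpp
#include <cassert>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

constexpr int LDSSize = 65536; // 64 KiB of LDS per workgroup, as in the patch

bool isPowerOfTwo(unsigned x) { return x && (x & (x - 1)) == 0; }

// Mirrors factorizePowerOf2 from the patch: enumerate all 2D warp grids
// (2^i, 2^j) with i + j = log2(n). Note that when log2(n) is even, the
// square grid is pushed twice, so the candidate list can contain a duplicate.
std::vector<std::pair<int, int>> factorizePowerOf2(int n) {
  assert(isPowerOfTwo(n));
  int x = static_cast<int>(std::log2(n));
  std::vector<std::pair<int, int>> pairs;
  for (int i = 0; i <= x / 2; ++i) {
    int j = x - i;
    pairs.push_back({1 << i, 1 << j});
    pairs.push_back({1 << j, 1 << i});
  }
  return pairs;
}

// Hypothetical stand-in for getCvtOpLDSUsage, following the comment's model
//   LDS_USAGE = warpsPerCTA(src)[0] * warpsPerCTA(dst)[1] * C
// applied to both conversions of the decomposition; C lumps together the
// per-tile factors (sizePerWarp, threadsPerWarp, sizeof(data_type)).
int estimateLDSUsage(int srcWarpsX, int dstWarpsY, int C) {
  return srcWarpsX * dstWarpsY * C;
}

int main() {
  // Illustrative scenario (assumed numbers): 8 warps per CTA, original mfma
  // grid {8, 1}, blocked layout with 8 warps along y, C = 2048 bytes.
  // The undecomposed cvt then needs 8 * 8 * 2048 = 128 KiB > 64 KiB of LDS.
  const int numWarps = 8;
  const int origMfmaWarpsX = 8;
  const int blockedWarpsY = 8;
  const int C = 2048;

  int minLDSUsage = 2 * LDSSize, minIdx = -1;
  auto candidates = factorizePowerOf2(numWarps);
  for (int i = 0; i < (int)candidates.size(); ++i) {
    auto [wx, wy] = candidates[i];
    // cvt(mfma -> mfma_tmp): src warps-x comes from the original grid,
    // dst warps-y from the candidate grid.
    int tmpLDS = estimateLDSUsage(origMfmaWarpsX, wy, C);
    // cvt(mfma_tmp -> blocked): src warps-x comes from the candidate grid.
    int newLDS = estimateLDSUsage(wx, blockedWarpsY, C);
    std::printf("warpsPerCTA = {%d, %d}: tmp = %d B, epilogue = %d B\n",
                wx, wy, tmpLDS, newLDS);
    // Keep the pair only if both conversions fit in LDS individually,
    // minimizing the summed usage, as in the patch.
    if (tmpLDS <= LDSSize && newLDS <= LDSSize &&
        tmpLDS + newLDS < minLDSUsage) {
      minLDSUsage = tmpLDS + newLDS;
      minIdx = i;
    }
  }
  if (minIdx >= 0)
    std::printf("picked {%d, %d}, total %d B\n", candidates[minIdx].first,
                candidates[minIdx].second, minLDSUsage);
  return 0;
}

With these numbers the search rejects {1, 8} and {8, 1} because one of the two conversions still overflows LDS, and settles on {2, 4}: the temporary cvt needs 64 KiB and the epilogue cvt 32 KiB, so each conversion fits in the 64 KiB of LDS on its own even though their sum does not.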