Skip to content

Commit

Permalink
Reworked
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreyPavlenko committed Oct 31, 2024
1 parent 87a9729 commit 7656d44
Show file tree
Hide file tree
Showing 8 changed files with 400 additions and 147 deletions.
33 changes: 22 additions & 11 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -124,29 +124,40 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
let summary = "GPU tiling and fusion pass.";
let description = [{
This path tiles linalg operations and wraps into foreach loops.
The tiles calculation is based on the Execution Unit cache size and the number of threads per EU.
This pass tiles linalg operations and creates an inner loop that is mapped to the block sizes when converting
to gpu.launch. The tile sizes are calculated from the GPU device properties retrieved from the DLTI attributes.
If the DLTI attributes are not specified, the values of the pass options are used instead.
}];
let options = [
Option<"euMem", "eu-mem", "size_t",
Option<"numEus", "num-eus", "size_t",
/*default=*/"448",
"Number of Execution Units.">,
Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
/*default=*/"8",
"Number of Execution Units per slice.">,
Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
/*default=*/"8",
"Number of threads per Execution Unit.">,
Option<"cacheSize", "cache-size", "size_t",
/*default=*/"131072",
"Execution Unit cache size.">,
Option<"euThreads", "eu-threads", "size_t",
/*default=*/"8",
"Number of threads per EU.">
Option<"vectorWidth", "vector-width", "size_t",
/*default=*/"512",
"The maximum width of EU's vector registers.">
];
}

def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
let summary = "Create nested parallel loops to be mapped to GPU.";
let description = [{
This path tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops.
Each tile of the outer loop is divided by the number of threads per EU.
This pass tiles the loops that were created by the GpuTilingAndFusion pass and then converted to parallel loops.
The tile sizes are calculated from the max_work_group_size DLTI attribute. If the attribute is not specified,
the work-group-size pass option is used instead.
}];
let options = [
Option<"euThreads", "eu-threads", "size_t",
/*default=*/"8",
"Number of threads per Execution Unit.">
Option<"workGroupSize", "work-group-size", "size_t",
/*default=*/"64",
"The maximum workgroup size.">
];
}
#endif // GC_USE_IMEX
Expand Down
39 changes: 17 additions & 22 deletions lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include "gc/Utils/Log.h"

using namespace mlir;
// using namespace mlir::gc::gpu;
using namespace mlir::gc;

namespace mlir::gc {
#define GEN_PASS_DECL_GPULOOPTILING
Expand All @@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,

void runOnOperation() override {
IRRewriter rewriter(&getContext());
auto euThreads = static_cast<double>(getEuThreads(rewriter));
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
auto wgSize = getWorkGroupSize(rewriter);
getOperation().walk<WalkOrder::PreOrder>([wgSize](scf::ParallelOp loop) {
if (!loop->getParentOfType<scf::ParallelOp>()) {
tile(loop, euThreads);
SmallVector<int64_t> tiles;
auto steps = loop.getStep();
tiles.reserve(steps.size());

for (auto step : steps) {
if (auto v = getConstIdxValue(step)) {
tiles.push_back(v);
} else {
tiles.push_back(32);
}
}

adjustTiles(wgSize, tiles);
tileParallelLoop(loop, tiles, false);
}
return WalkResult::skip();
});
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
gcLogD("Failed to simplify regions");
}
}

private:
static void tile(scf::ParallelOp loop, double euThreads) {
SmallVector<int64_t> tileSizes;
auto steps = loop.getStep();
tileSizes.reserve(steps.size());

for (auto step : steps) {
if (auto v = getConstIdxValue(step)) {
tileSizes.push_back(static_cast<int64_t>(
std::ceil(static_cast<double>(v) / euThreads)));
} else {
tileSizes.push_back(32);
}
}

tileParallelLoop(loop, tileSizes, false);
}
};
} // namespace
Loading

0 comments on commit 7656d44

Please sign in to comment.