Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreyPavlenko committed Oct 24, 2024
1 parent 7d157e7 commit 04326d8
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 28 deletions.
36 changes: 16 additions & 20 deletions lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,30 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,

void runOnOperation() override {
IRRewriter rewriter(&getContext());
auto euThreads = static_cast<double>(getEuThreads(rewriter));
size_t euThreads = getEuThreads(rewriter);
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
if (!loop->getParentOfType<scf::ParallelOp>()) {
tile(loop, euThreads);
SmallVector<int64_t> loopSizes;
auto steps = loop.getStep();
loopSizes.reserve(steps.size());

for (auto step : steps) {
if (auto v = getConstIdxValue(step)) {
loopSizes.push_back(v);
} else {
loopSizes.push_back(32);
}
}

SmallVector<int64_t> tileSizes;
normaliseTiles(euThreads, loopSizes, tileSizes);
tileParallelLoop(loop, tileSizes, false);
}
return WalkResult::skip();
});
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
gcLogD("Failed to simplify regions");
}
}

private:
static void tile(scf::ParallelOp loop, double euThreads) {
SmallVector<int64_t> tileSizes;
auto steps = loop.getStep();
tileSizes.reserve(steps.size());

for (auto step : steps) {
if (auto v = getConstIdxValue(step)) {
tileSizes.push_back(static_cast<int64_t>(
std::ceil(static_cast<double>(v) / euThreads)));
} else {
tileSizes.push_back(32);
}
}

tileParallelLoop(loop, tileSizes, false);
}
};
} // namespace
40 changes: 32 additions & 8 deletions lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,21 @@ struct GpuTilingAndFusion final
void runOnOperation() override {
IRRewriter rewriter(&getContext());
scf::SCFTileAndFuseOptions opts;
opts.setFusionControlFn(
[&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
bool isDestinationOperand)
-> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
Operation *op = originalProducer.getOwner();
if (!op) {
return std::nullopt;
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
if (!linalgOp.hasOnlyProjectedPermutations()) {
return std::nullopt;
}
}
return scf::SCFTileAndFuseOptions::ControlFnResult{};
});
opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
// The outer loop is converted to a GPU kernel and the tile sizes are mapped
// to the grid sizes.
Expand Down Expand Up @@ -77,13 +92,15 @@ struct GpuTilingAndFusion final
assert(itTypes.size() == itDomains.size());

// TODO: Add a parameter to the options?
size_t totalSize = calcOperandsSize(op) * euThreads;
size_t totalSize = calcOperandsSize(op);
unsigned loopCount = 0;
SmallVector<int64_t> sizes;

for (auto [t, r] : zip(itTypes, itDomains)) {
if (t == utils::IteratorType::parallel) {
if (auto v = getConstantIntValue(r.size)) {
loopCount++;
sizes.emplace_back(*v);
totalSize *= *v;
} else {
return calcDynamicSizes(builder, ti, euMem, euThreads);
Expand All @@ -95,19 +112,25 @@ struct GpuTilingAndFusion final
return {};
}

// TODO: In case of different sizes, calculate the ratio for each loop
double ratio = std::pow(static_cast<double>(totalSize) /
static_cast<double>(euMem),
1.0 / loopCount);
ratio = std::max(1.0, ratio);
auto outerTileSize = static_cast<size_t>(
std::ceil(static_cast<double>(euMem) /
static_cast<double>(calcOperandsSize(op))));
SmallVector<int64_t> outerTiles;
SmallVector<int64_t> innerTiles;
normaliseTiles(outerTileSize, sizes, outerTiles);
normaliseTiles(euThreads, sizes, innerTiles);

unsigned counter = 0;
SmallVector<OpFoldResult> tiles;
tiles.reserve(itDomains.size());

for (auto [t, r] : zip(itTypes, itDomains)) {
if (t != utils::IteratorType::parallel) {
tiles.emplace_back(builder.getIndexAttr(1));
} else if (auto v = getConstantIntValue(r.size)) {
tiles.emplace_back(ceil(builder, *v, ratio));
tiles.emplace_back(
ceil(builder, outerTiles[counter], innerTiles[counter]));
counter++;
} else {
abort(); // Must never get here
}
Expand Down Expand Up @@ -174,7 +197,8 @@ struct GpuTilingAndFusion final
static std::optional<TilingInterface> findTi(Operation *op) {
std::optional<TilingInterface> last;
op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
if (!linalgOp->getParentOfType<scf::ForallOp>()) {
if (linalgOp.hasOnlyProjectedPermutations() &&
!linalgOp->getParentOfType<scf::ForallOp>()) {
if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
last = ti;
}
Expand Down
26 changes: 26 additions & 0 deletions lib/gc/Transforms/GPU/GpuUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#ifndef GPUUTILS_H
#define GPUUTILS_H

#include <numeric>

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"
Expand Down Expand Up @@ -69,4 +71,28 @@ static int64_t getConstIdxValue(Value value) {
}
return 0;
}

/// Distributes a total budget of `totalSize` among the loops described by
/// `loopSizes`, writing one tile size per loop into `tiles`.
///
/// The loops are visited in ascending order of size. Each loop receives the
/// integer k-th root of the remaining budget, where k is the number of loops
/// not yet processed, capped by the loop's own size. The remaining budget is
/// then divided by the chosen factor, so whatever a small loop cannot consume
/// is passed on to the larger ones.
///
/// \param totalSize budget to distribute (the product of the resulting tiles
///                  never exceeds it, unless it is 0 and every tile is 1).
/// \param loopSizes sizes of the loops; not modified.
/// \param tiles     output; resized to `loopSizes.size()`, with `tiles[i]`
///                  corresponding to `loopSizes[i]`. Every tile is >= 1.
static void normaliseTiles(size_t totalSize, SmallVector<int64_t> &loopSizes,
                           SmallVector<int64_t> &tiles) {
  size_t loopCount = loopSizes.size();
  assert(loopCount > 0);

  // Remember the original position of every loop so that the tiles can be
  // written back in the caller's order after sorting by size.
  std::vector<std::pair<int64_t, size_t>> sorted;
  sorted.reserve(loopCount);
  for (size_t i = 0; i < loopCount; ++i) {
    sorted.emplace_back(loopSizes[i], i);
  }
  std::sort(sorted.begin(), sorted.end());
  tiles.assign(loopCount, 1);

  // Returns true if base^exponent <= limit, without overflowing.
  // `base` must be >= 1.
  auto powerFits = [](size_t base, size_t exponent, size_t limit) {
    size_t product = 1;
    for (size_t k = 0; k < exponent; ++k) {
      if (product > limit / base) {
        return false;
      }
      product *= base;
    }
    return true;
  };

  // Returns floor(value^(1/degree)) computed exactly. std::pow() alone is not
  // reliable here: e.g. pow(64, 1.0 / 3) yields 3.9999999999999996, which
  // truncates to 3 instead of 4. The floating-point result is therefore only
  // used as an initial guess and corrected with integer arithmetic.
  auto integerRoot = [&powerFits](size_t value, size_t degree) -> size_t {
    if (degree <= 1 || value <= 1) {
      return value;
    }
    auto root = static_cast<size_t>(std::pow(
        static_cast<double>(value), 1.0 / static_cast<double>(degree)));
    if (root < 1) {
      root = 1;
    }
    while (powerFits(root + 1, degree, value)) {
      ++root;
    }
    while (root > 1 && !powerFits(root, degree, value)) {
      --root;
    }
    return root;
  };

  // Distribute the totalSize among the tiles
  for (size_t i = 0; i < loopCount; ++i) {
    const int64_t loopSize = sorted[i].first;
    const size_t root = integerRoot(totalSize, loopCount - i);

    // A tile is never smaller than 1: this also guards the division below
    // against a zero budget and against zero or negative loop sizes.
    int64_t factor = 1;
    if (loopSize > 1 && root > 1) {
      factor = root >= static_cast<size_t>(loopSize)
                   ? loopSize
                   : static_cast<int64_t>(root);
    }

    tiles[sorted[i].second] = factor;
    totalSize /= static_cast<size_t>(factor);
  }
}
#endif

0 comments on commit 04326d8

Please sign in to comment.