Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NDArray] Map NDArray ops to GPUs #974

Merged
merged 5 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/imex/Conversion/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <imex/Conversion/GPUXToLLVM/GPUXToLLVMPass.h>
#include <imex/Conversion/MathToVC/MathToVC.h>
#include <imex/Conversion/NDArrayToLinalg/NDArrayToLinalg.h>
#include <imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h>
#include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
#include <imex/Conversion/XeTileToXeGPU/XeTileToXeGPU.h>

Expand Down
13 changes: 13 additions & 0 deletions include/imex/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,19 @@ def DropRegions: Pass<"drop-regions"> {
let options = [];
}

//===----------------------------------------------------------------------===//
// ConvertRegionParallelLoopToGpu
//===----------------------------------------------------------------------===//

// Pass record for `convert-region-parallel-loops-to-gpu`; instances are built
// by ::imex::createConvertRegionParallelLoopToGpuPass().
// No operation type is given to `Pass<>`, and the pass declares no options.
// NOTE(review): `dependentDialects` is empty although the summary says gpu
// launch operations are created -- confirm that the dialects of the produced
// ops are guaranteed to be loaded before this pass runs.
def ConvertRegionParallelLoopToGpu : Pass<"convert-region-parallel-loops-to-gpu"> {
let summary = "Convert mapped scf.parallel ops within GPU regions to gpu launch operations";
let description = [{
Convert scf.parallel ops within GPU regions to gpu launch operations.
}];
let constructor = "::imex::createConvertRegionParallelLoopToGpuPass()";
let dependentDialects = [];
let options = [];
}

//===----------------------------------------------------------------------===//
// GPUToSPIRV
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===- RegionParallelLoopToGpu.h -------*- C++ -*-===//
//
// Copyright 2024 Intel Corporation
// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Adds the conversion pattern from `scf.parallel` within `region.env_region`
/// to `gpu.launch`.
///
//===----------------------------------------------------------------------===//

#ifndef _RegionParallelLoopToGpu_H_INCLUDED_
#define _RegionParallelLoopToGpu_H_INCLUDED_

#include <mlir/IR/PatternMatch.h>

namespace mlir {
class Pass;
} // namespace mlir

namespace imex {
#define GEN_PASS_DECL_CONVERTREGIONPARALLELLOOPTOGPU
#include "imex/Conversion/Passes.h.inc"

/// Create the `convert-region-parallel-loops-to-gpu` pass, which converts
/// `scf.parallel` ops nested in GPU `region.env_region` ops to `gpu.launch`.
/// \return a newly allocated pass owned by the caller.
std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass();

} // namespace imex

#endif // _RegionParallelLoopToGpu_H_INCLUDED_
2 changes: 2 additions & 0 deletions include/imex/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace imex {
std::unique_ptr<mlir::Pass> createSerializeSPIRVPass();
std::unique_ptr<mlir::Pass>
createInsertGPUAllocsPass(const char *clientAPI = "vulkan");
/// Create the `insert-gpu-copy` pass, which replaces `memref.copy` ops located
/// inside an environment region with `gpu.memcpy` ops.
std::unique_ptr<mlir::Pass> createInsertGPUCopyPass();
std::unique_ptr<mlir::Pass> createSetSPIRVCapabilitiesPass();
std::unique_ptr<mlir::Pass>
createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan");
Expand All @@ -40,6 +41,7 @@ createOptimizeTransposePass(const std::string &device = "pvc");
std::unique_ptr<mlir::Pass> createHoistTransposePass();
std::unique_ptr<mlir::Pass> createVnniTransformationPass();
std::unique_ptr<mlir::Pass> createEmulateNonNativeBF16Pass();
/// Create the `tile-loops` pass (declared as `TileLoops` in Passes.td).
std::unique_ptr<mlir::Pass> createTileLoopsPass();

#define GEN_PASS_DECL
#include "imex/Transforms/Passes.h.inc"
Expand Down
36 changes: 35 additions & 1 deletion include/imex/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,20 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
"The client API to use for inserting gpu allocs">,
Option<"inRegions", "in-regions", "bool", "false",
"Add gpu allocs only for memref.AllocOps within GPU regions">
"Add gpu allocs only for memref.AllocOps within GPU regions">,
// `host-shared` (default: true) is forwarded as the host-shared flag of the
// gpu.alloc ops created by this pass.
Option<"hostShared", "host-shared", "bool", "true",
"If set, allocate shared memory accessible on both host and device.">
];
}

// Pass record for `insert-gpu-copy`, anchored on ::mlir::func::FuncOp and
// constructed through imex::createInsertGPUCopyPass().
// The memref, gpu and arith dialects are declared as dependent dialects.
def InsertGPUCopy : Pass<"insert-gpu-copy", "::mlir::func::FuncOp"> {
let summary = "Converts memref.copy op to gpu.memcpy if within an env region.";
let constructor = "imex::createInsertGPUCopyPass()";
let dependentDialects = ["::mlir::memref::MemRefDialect",
"::mlir::gpu::GPUDialect",
"::mlir::arith::ArithDialect"];
}

def SetSPIRVCapabilities : Pass<"set-spirv-capabilities"> {
let summary = "Sets Spirv capabilities";
let constructor = "imex::createSetSPIRVCapabilitiesPass()";
Expand Down Expand Up @@ -200,4 +210,28 @@ def HoistTranspose : Pass<"imex-xegpu-hoist-transpose"> {
];
}

// Pass record for `tile-loops`, anchored on ::mlir::func::FuncOp and
// constructed through imex::createTileLoopsPass().
def TileLoops : Pass<"tile-loops", "::mlir::func::FuncOp"> {
let summary = "Tile linalg.generic loops for GPU offloading";
let description = [{
Tiles loops defined with tensor inputs/outputs using the given tile sizes.
This pass should be applied after loop fusion and before bufferization.
Uses `TileUsingSCF` method. To map the loop to GPU blocks and threads this
pass should be called twice. If `in-regions` is set, only loops within GPU
regions are tiled.
}];
// Command-line options:
//   tile-sizes      - list of int64_t tile sizes (no default given here)
//   min-tile-factor - int64_t, defaults to 2
//   in-regions      - bool, defaults to false
let options = [
ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">,
Option<"minTileFactor", "min-tile-factor", "int64_t", "2",
"Minimum factor between dimension size and a tile size">,
Option<"inRegions", "in-regions", "bool", "false",
"Convert loops only within GPU regions">
];
let constructor = "imex::createTileLoopsPass()";
// The linalg and scf dialects are declared as dependent dialects.
let dependentDialects = [
"::mlir::linalg::LinalgDialect",
"::mlir::scf::SCFDialect"
];
}


#endif // _IMEX_TRANSFORMS_PASSES_TD_INCLUDED_
1 change: 1 addition & 0 deletions lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ add_subdirectory(ArithToVC)
add_subdirectory(DistToStandard)
add_subdirectory(NDArrayToLinalg)
add_subdirectory(DropRegions)
add_subdirectory(RegionParallelLoopToGpu)
add_subdirectory(GPUToSPIRV)
add_subdirectory(GPUToGPUX)
add_subdirectory(GPUXToLLVM)
Expand Down
9 changes: 6 additions & 3 deletions lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,12 +526,15 @@ struct CopyLowering
::mlir::MemRefType::get(tTyp.getShape(), tTyp.getElementType());
auto mr = rewriter.create<::mlir::memref::AllocOp>(
loc, mrTyp, dynDims, rewriter.getI64IntegerAttr(8));
// and copy if non-0
// and copy if not zero sized
if (!retArTyp.hasZeroSize()) {
auto srcMR =
createToMemRef(loc, rewriter, src, srcArTyp.getMemRefType(src));
// create a region with given env, add copy op within it
auto env = rewriter.getStringAttr("protect_copy_op");
// wrap copy in a region to mark it non-deletable or a gpu copy
bool hasGPUEnv = ::imex::ndarray::hasGPUEnv(srcArTyp) ||
::imex::ndarray::hasGPUEnv(retArTyp);
std::string regName = hasGPUEnv ? "gpu_copy_op" : "protect_copy_op";
auto env = rewriter.getStringAttr(regName);
rewriter.create<::imex::region::EnvironmentRegionOp>(
loc, env, std::nullopt, std::nullopt,
[&srcMR, &mr](::mlir::OpBuilder &builder, ::mlir::Location loc) {
Expand Down
12 changes: 12 additions & 0 deletions lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Conversion library containing the region-parallel-loop-to-GPU pass.
# The target keeps its IMEXSCFToGPU name so existing references to it keep
# working; only the header directory is corrected to the directory that holds
# this conversion's public header.
add_imex_conversion_library(IMEXSCFToGPU
  RegionParallelLoopToGpu.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/imex/Conversion/RegionParallelLoopToGpu

  DEPENDS
  IMEXConversionPassIncGen

  LINK_LIBS PUBLIC
  MLIRSCFToGPU
)
69 changes: 69 additions & 0 deletions lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//===- RegionParallelLoopToGpu.cpp - --------------*- C++ -*-===//
//
// Copyright 2024 Intel Corporation
// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file extends upstream ParallelLoopToGpuPass by applying the transform
/// only if the parallel loop is within a GPU region
/// (`region.env_region #region.gpu_env<...>`).
///
//===----------------------------------------------------------------------===//

#include <imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h>
#include <imex/Dialect/Region/RegionUtils.h>
#include <mlir/Conversion/SCFToGPU/SCFToGPU.h>
#include <mlir/Dialect/Affine/IR/AffineOps.h>
#include <mlir/IR/PatternMatch.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>

namespace imex {
#define GEN_PASS_DEF_CONVERTREGIONPARALLELLOOPTOGPU
#include "imex/Conversion/Passes.h.inc"

namespace {
/// Pass that applies the upstream parallel-loop-to-GPU conversion patterns,
/// but only to `region.env_region` ops for which `isGpuRegion` returns true.
struct ConvertRegionParallelLoopToGpuPass
    : public ::imex::impl::ConvertRegionParallelLoopToGpuBase<
          ConvertRegionParallelLoopToGpuPass> {
  ConvertRegionParallelLoopToGpuPass() = default;

  void runOnOperation() override {
    auto &ctx = getContext();

    // Step 1: gather the GPU regions. Returning skip() for a GPU region stops
    // the walk from descending into it, so regions nested inside an already
    // collected GPU region are not collected again.
    ::mlir::SmallVector<::mlir::Operation *> gpuRegions;
    getOperation()->walk(
        [&gpuRegions](::imex::region::EnvironmentRegionOp envRegion,
                      const ::mlir::WalkStage &) {
          if (!::imex::region::isGpuRegion(envRegion))
            return ::mlir::WalkResult::advance();
          gpuRegions.push_back(envRegion);
          return ::mlir::WalkResult::skip();
        });

    // Step 2: set up the upstream patterns and legality. Ops unknown to the
    // target are treated as legal, so only what
    // configureParallelLoopToGPULegality marks is subject to conversion.
    ::mlir::RewritePatternSet loopPatterns(&ctx);
    ::mlir::populateParallelLoopToGPUPatterns(loopPatterns);
    ::mlir::ConversionTarget legality(ctx);
    legality.markUnknownOpDynamicallyLegal(
        [](::mlir::Operation *) { return true; });
    ::mlir::configureParallelLoopToGPULegality(legality);

    // Step 3: convert inside the collected regions only.
    const bool converted = ::mlir::succeeded(::mlir::applyPartialConversion(
        gpuRegions, legality, std::move(loopPatterns)));
    if (!converted)
      signalPassFailure();

    // Finalization runs on the whole operation whether or not the conversion
    // succeeded.
    ::mlir::finalizeParallelLoopToGPUConversion(getOperation());
  }
};
} // namespace

/// Create a pass that converts `scf.parallel` ops inside GPU environment
/// regions to GPU launches.
std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass() {
  std::unique_ptr<::mlir::Pass> pass =
      std::make_unique<ConvertRegionParallelLoopToGpuPass>();
  return pass;
}

} // namespace imex
2 changes: 2 additions & 0 deletions lib/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ add_mlir_library(IMEXTransforms
CastIndex.cpp
EmulateNonNativeBF16.cpp
InsertGPUAllocs.cpp
InsertGPUCopy.cpp
LowerMemRefCopy.cpp
RemoveSingleElemVector.cpp
RemoveTemporaries.cpp
Expand All @@ -14,6 +15,7 @@ add_mlir_library(IMEXTransforms
VnniTransformation.cpp
OptimizeTranspose.cpp
HoistTranspose.cpp
TileLoops.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/imex/Transforms
Expand Down
2 changes: 1 addition & 1 deletion lib/Transforms/InsertGPUAllocs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class InsertGPUAllocsPass final
auto allocResult = builder.create<::mlir::gpu::AllocOp>(
alloc.getLoc(), alloc.getType(), /*asyncToken*/ nullptr,
/*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
alloc.getSymbolOperands(), true);
alloc.getSymbolOperands(), /*hostShared*/ hostShared);
alloc.replaceAllUsesWith(allocResult);
alloc.erase();
}
Expand Down
89 changes: 89 additions & 0 deletions lib/Transforms/InsertGPUCopy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//===- InsertGPUCopy.cpp - InsertGPUCopy Pass -------*- C++ -*-===//
//
// Copyright 2022 Intel Corporation
// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file replaces the memref.copy ops with gpu.memcpy ops if the
/// memref.copy resides in an environment region. This environment region must
/// be created in a prior pass where the device/host memory semantics are
/// present.
///
//===----------------------------------------------------------------------===//

#include "llvm/Support/Threading.h"
#include <imex/Transforms/Passes.h>

#include <imex/Dialect/Region/RegionUtils.h>
#include <mlir/Dialect/Affine/IR/AffineOps.h>
#include <mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h>
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/GPU/Transforms/Passes.h>
#include <mlir/Dialect/MemRef/IR/MemRef.h>
#include <mlir/Pass/Pass.h>
#include <optional>

namespace imex {
#define GEN_PASS_DEF_INSERTGPUCOPY
#include "imex/Transforms/Passes.h.inc"
} // namespace imex

namespace {

/// Returns true when `op` is non-null and has an enclosing
/// `region.env_region` op. Any environment region qualifies; the kind of
/// environment is not inspected.
inline bool isInEnvRegion(::mlir::Operation *op) {
  return op && op->getParentOfType<::imex::region::EnvironmentRegionOp>();
}

/// Function pass that replaces each `memref.copy` nested in an environment
/// region with a `gpu.memcpy` that has no result and no async dependencies.
class InsertGPUCopyPass final
    : public imex::impl::InsertGPUCopyBase<InsertGPUCopyPass> {
public:
  void runOnOperation() override {
    auto func = getOperation();
    auto &body = func.getBody();

    // A function without a body has nothing to rewrite.
    if (body.empty())
      return;

    // Functions with more than one block are rejected with an error.
    if (!llvm::hasSingleElement(body)) {
      func.emitError("Function must have exactly one block");
      signalPassFailure();
      return;
    }

    // Collect the candidates first; the rewriting happens after the walk.
    ::mlir::SmallVector<::mlir::memref::CopyOp> pendingCopies;
    func.walk([&pendingCopies](::mlir::memref::CopyOp candidate) {
      if (isInEnvRegion(candidate))
        pendingCopies.push_back(candidate);
    });

    // Emit the gpu.memcpy right before each collected copy, then erase the
    // original op.
    mlir::OpBuilder rewriter(func);
    for (::mlir::memref::CopyOp staleCopy : pendingCopies) {
      rewriter.setInsertionPoint(staleCopy);
      rewriter.create<::mlir::gpu::MemcpyOp>(
          staleCopy.getLoc(), /*resultTypes*/ ::mlir::TypeRange{},
          /*asyncDependencies*/ ::mlir::ValueRange{}, staleCopy.getTarget(),
          staleCopy.getSource());
      staleCopy.erase();
    }
  }
};

} // namespace

namespace imex {
/// Create the pass that turns `memref.copy` ops inside environment regions
/// into `gpu.memcpy` ops.
std::unique_ptr<mlir::Pass> createInsertGPUCopyPass() {
  std::unique_ptr<mlir::Pass> pass = std::make_unique<InsertGPUCopyPass>();
  return pass;
}
} // namespace imex
Loading
Loading