From 06cb9f1e0e0b681b85f34f6ec0d1841efea4249e Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 25 Apr 2024 21:41:48 +0300 Subject: [PATCH 1/5] NDArrayToLinalg: mark ndarray.copy with region env if it involves a gpu array --- lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp | 9 ++++++--- test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp b/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp index 96a5dfea6..a457382ff 100644 --- a/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp +++ b/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp @@ -526,12 +526,15 @@ struct CopyLowering ::mlir::MemRefType::get(tTyp.getShape(), tTyp.getElementType()); auto mr = rewriter.create<::mlir::memref::AllocOp>( loc, mrTyp, dynDims, rewriter.getI64IntegerAttr(8)); - // and copy if non-0 + // and copy if not zero sized if (!retArTyp.hasZeroSize()) { auto srcMR = createToMemRef(loc, rewriter, src, srcArTyp.getMemRefType(src)); - // create a region with given env, add copy op within it - auto env = rewriter.getStringAttr("protect_copy_op"); + // wrap copy in a region to mark it non-deletable or a gpu copy + bool hasGPUEnv = ::imex::ndarray::hasGPUEnv(srcArTyp) || + ::imex::ndarray::hasGPUEnv(retArTyp); + std::string regName = hasGPUEnv ? 
"gpu_copy_op" : "protect_copy_op"; + auto env = rewriter.getStringAttr(regName); rewriter.create<::imex::region::EnvironmentRegionOp>( loc, env, std::nullopt, std::nullopt, [&srcMR, &mr](::mlir::OpBuilder &builder, ::mlir::Location loc) { diff --git a/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir b/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir index e5aa868be..86c23cdf0 100644 --- a/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir +++ b/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir @@ -365,7 +365,7 @@ func.func @test_copy(%a: !ndarray.ndarray) -> !ndarray.ndarray { // CHECK-NEXT: tensor.dim // CHECK-NEXT: memref.alloc // CHECK-NEXT: bufferization.to_memref -// CHECK-NEXT: region.env_region "protect_copy_op" +// CHECK-NEXT: region.env_region "gpu_copy_op" // CHECK-NEXT: memref.copy // CHECK-NEXT: } // CHECK-NEXT: bufferization.to_tensor @@ -373,7 +373,7 @@ func.func @test_copy(%a: !ndarray.ndarray) -> !ndarray.ndarray { // CHECK-NEXT: tensor.dim // CHECK-NEXT: memref.alloc // CHECK-NEXT: bufferization.to_memref -// CHECK-NEXT: region.env_region "protect_copy_op" +// CHECK-NEXT: region.env_region "gpu_copy_op" // CHECK-NEXT: memref.copy // CHECK-NEXT: } // CHECK-NEXT: bufferization.to_tensor From 77774a6d4369ce47143d474199db5dd6f52e3dd9 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 26 Apr 2024 09:57:32 +0300 Subject: [PATCH 2/5] InsertGPUAllocs: add host-shared option, defaults to true --- include/imex/Transforms/Passes.td | 4 +++- lib/Transforms/InsertGPUAllocs.cpp | 2 +- test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td index b97fdf891..664ce0ec2 100644 --- a/include/imex/Transforms/Passes.td +++ b/include/imex/Transforms/Passes.td @@ -41,7 +41,9 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> { Option<"clientAPI", "client-api", "std::string", 
/*default=*/"\"opencl\"", "The client API to use for inserting gpu allocs">, Option<"inRegions", "in-regions", "bool", "false", - "Add gpu allocs only for memref.AllocOps within GPU regions"> + "Add gpu allocs only for memref.AllocOps within GPU regions">, + Option<"hostShared", "host-shared", "bool", "true", + "If set, allocate shared memory accessible on both host and device."> ]; } diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp index d9e350e2c..87010e536 100644 --- a/lib/Transforms/InsertGPUAllocs.cpp +++ b/lib/Transforms/InsertGPUAllocs.cpp @@ -105,7 +105,7 @@ class InsertGPUAllocsPass final auto allocResult = builder.create<::mlir::gpu::AllocOp>( alloc.getLoc(), alloc.getType(), /*asyncToken*/ nullptr, /*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(), - alloc.getSymbolOperands(), true); + alloc.getSymbolOperands(), /*hostShared*/ hostShared); alloc.replaceAllUsesWith(allocResult); alloc.erase(); } diff --git a/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir b/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir index a85db359b..0861a69b2 100644 --- a/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir +++ b/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt --insert-gpu-allocs='in-regions=1' %s | FileCheck %s +// RUN: imex-opt --insert-gpu-allocs='in-regions=1 host-shared=0' %s | FileCheck %s func.func @test_region_alloc() { %0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32> @@ -16,7 +16,7 @@ func.func @test_region_alloc() { // CHECK-LABEL: func.func @test_region_alloc // CHECK-NEXT: memref.alloc() {alignment = 128 : i64} : memref<2x5xf32> // CHECK-NEXT: region.env_region #region.gpu_env -> memref<2x5xf32> { -// CHECK-NEXT: gpu.alloc host_shared () : memref<2x5xf32> +// CHECK-NEXT: gpu.alloc () : memref<2x5xf32> // CHECK-NEXT: region.env_region_yield %memref : memref<2x5xf32> // CHECK-NEXT: } // CHECK: memref.dealloc %alloc : memref<2x5xf32> From 
55f55a5c269cd4c557593c1b21e5f6191cfc800d Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 25 Apr 2024 23:51:08 +0300 Subject: [PATCH 3/5] Add insert-gpu-copy pass --- include/imex/Transforms/Passes.h | 1 + include/imex/Transforms/Passes.td | 8 +++ lib/Transforms/CMakeLists.txt | 1 + lib/Transforms/InsertGPUCopy.cpp | 89 ++++++++++++++++++++++++++++ test/Transforms/insert-gpu-copy.mlir | 47 +++++++++++++++ 5 files changed, 146 insertions(+) create mode 100644 lib/Transforms/InsertGPUCopy.cpp create mode 100644 test/Transforms/insert-gpu-copy.mlir diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h index 6e6f7d1b6..f6e9baece 100644 --- a/include/imex/Transforms/Passes.h +++ b/include/imex/Transforms/Passes.h @@ -25,6 +25,7 @@ namespace imex { std::unique_ptr createSerializeSPIRVPass(); std::unique_ptr createInsertGPUAllocsPass(const char *clientAPI = "vulkan"); +std::unique_ptr createInsertGPUCopyPass(); std::unique_ptr createSetSPIRVCapabilitiesPass(); std::unique_ptr createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan"); diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td index 664ce0ec2..67630726c 100644 --- a/include/imex/Transforms/Passes.td +++ b/include/imex/Transforms/Passes.td @@ -47,6 +47,14 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> { ]; } +def InsertGPUCopy : Pass<"insert-gpu-copy", "::mlir::func::FuncOp"> { + let summary = "Converts memref.copy op to gpu.memcpy if within an env region."; + let constructor = "imex::createInsertGPUCopyPass()"; + let dependentDialects = ["::mlir::memref::MemRefDialect", + "::mlir::gpu::GPUDialect", + "::mlir::arith::ArithDialect"]; +} + def SetSPIRVCapabilities : Pass<"set-spirv-capabilities"> { let summary = "Sets Spirv capabilities"; let constructor = "imex::createSetSPIRVCapabilitiesPass()"; diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 47d7e21b7..b1b2a6bf3 100644 --- 
a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(IMEXTransforms CastIndex.cpp EmulateNonNativeBF16.cpp InsertGPUAllocs.cpp + InsertGPUCopy.cpp LowerMemRefCopy.cpp RemoveSingleElemVector.cpp RemoveTemporaries.cpp diff --git a/lib/Transforms/InsertGPUCopy.cpp b/lib/Transforms/InsertGPUCopy.cpp new file mode 100644 index 000000000..50af43f92 --- /dev/null +++ b/lib/Transforms/InsertGPUCopy.cpp @@ -0,0 +1,89 @@ +//===- InsertGPUCopy.cpp - InsertGPUCopy Pass -------*- C++ -*-===// +// +// Copyright 2022 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file replaces the memref.copy ops with gpu.memcpy ops if the +/// memref.copy resides in an environment region. This environment region must +/// be created in a prior pass where the device/host memory semantics are +/// present. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Threading.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace imex { +#define GEN_PASS_DEF_INSERTGPUCOPY +#include "imex/Transforms/Passes.h.inc" +} // namespace imex + +namespace { + +inline bool isInEnvRegion(::mlir::Operation *op) { + if (!op) + return false; + if (!op->getParentOfType<::imex::region::EnvironmentRegionOp>()) + return false; + return true; +} + +class InsertGPUCopyPass final + : public imex::impl::InsertGPUCopyBase { +public: + void runOnOperation() override { + auto func = getOperation(); + auto &funcBody = func.getBody(); + if (funcBody.empty()) { + return; + } else if (!llvm::hasSingleElement(funcBody)) { + func.emitError("Function must have exactly one block"); + signalPassFailure(); + return; + } + + mlir::OpBuilder builder(func); + // collect copy ops in GPU regions + ::mlir::SmallVector<::mlir::memref::CopyOp> copyOpsInGpuRegion; + + // traverse ops and identify memref.copy ops which are in GPU region + (void)func.walk([&](::mlir::memref::CopyOp op) { + if (isInEnvRegion(op)) { + copyOpsInGpuRegion.emplace_back(op); + } + }); + + // Replace copy ops with gpu.memcpy + for (auto copyOp : copyOpsInGpuRegion) { + builder.setInsertionPoint(copyOp); + // /*asyncToken*/ std::nullopt, + builder.create<::mlir::gpu::MemcpyOp>( + copyOp.getLoc(), /*resultTypes*/ ::mlir::TypeRange{}, + /*asyncDependencies*/ ::mlir::ValueRange{}, copyOp.getTarget(), + copyOp.getSource()); + copyOp.erase(); + } + } +}; + +} // namespace + +namespace imex { +std::unique_ptr createInsertGPUCopyPass() { + return std::make_unique(); +} +} // namespace imex diff --git a/test/Transforms/insert-gpu-copy.mlir b/test/Transforms/insert-gpu-copy.mlir new file mode 100644 index 000000000..434711da2 --- /dev/null +++ b/test/Transforms/insert-gpu-copy.mlir @@ -0,0 +1,47 @@ +// RUN: imex-opt --split-input-file 
--insert-gpu-copy %s -verify-diagnostics -o -| FileCheck %s + +func.func @test_copy_gpu_region() -> (memref, memref) { + %c5 = arith.constant 5 : index + %alloc = memref.alloc(%c5, %c5) : memref + %0 = region.env_region #region.gpu_env -> memref { + %memref = gpu.alloc (%c5, %c5) : memref + memref.copy %alloc, %memref : memref to memref + region.env_region_yield %memref : memref + } + return %alloc, %0 : memref, memref +} +// CHECK-LABEL: func.func @test_copy_gpu_region +// CHECK: [[ALLOC:%.*]] = memref.alloc +// CHECK: region.env_region #region.gpu_env +// CHECK-NEXT: [[ALLOC2:%.*]] = gpu.alloc +// CHECK-NEXT: gpu.memcpy [[ALLOC2]], [[ALLOC]] : memref, memref +// CHECK-NEXT: region.env_region_yield [[ALLOC2]] : memref + +func.func @test_copy_region() -> (memref, memref) { + %c5 = arith.constant 5 : index + %alloc = memref.alloc(%c5, %c5) : memref + %0 = region.env_region "string_attr" -> memref { + %memref = gpu.alloc (%c5, %c5) : memref + memref.copy %alloc, %memref : memref to memref + region.env_region_yield %memref : memref + } + return %alloc, %0 : memref, memref +} +// CHECK-LABEL: func.func @test_copy_region +// CHECK: [[ALLOC:%.*]] = memref.alloc +// CHECK: region.env_region "string_attr" +// CHECK-NEXT: [[ALLOC2:%.*]] = gpu.alloc +// CHECK-NEXT: gpu.memcpy [[ALLOC2]], [[ALLOC]] : memref, memref +// CHECK-NEXT: region.env_region_yield [[ALLOC2]] : memref + +func.func @test_copy() -> (memref, memref) { + %c5 = arith.constant 5 : index + %alloc = memref.alloc(%c5, %c5) : memref + %memref = memref.alloc(%c5, %c5) : memref + memref.copy %alloc, %memref : memref to memref + return %alloc, %memref : memref, memref +} +// CHECK-LABEL: func.func @test_copy +// CHECK: [[ALLOC:%.*]] = memref.alloc +// CHECK: [[MEMREF:%.*]] = memref.alloc +// CHECK: memref.copy [[ALLOC]], [[MEMREF]] : memref to memref From 371a6b4f5ce819394bfdc73551b937f7f688cd77 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Mon, 6 May 2024 16:00:25 +0200 Subject: [PATCH 4/5] add 
convert-region-parallel-loops-to-gpu pass converting parloops only in gpu regions Ported from #441, fixed compilation and updated tests --- include/imex/Conversion/Passes.h | 1 + include/imex/Conversion/Passes.td | 13 ++++ .../RegionParallelLoopToGpu/CMakeLists.txt | 0 .../RegionParallelLoopToGpu.h | 34 +++++++++ lib/Conversion/CMakeLists.txt | 1 + .../RegionParallelLoopToGpu/CMakeLists.txt | 12 ++++ .../RegionParallelLoopToGpu.cpp | 69 +++++++++++++++++++ .../SCFToGPU/RegionParallelLoopToGpu.mlir | 69 +++++++++++++++++++ 8 files changed, 199 insertions(+) create mode 100644 include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt create mode 100644 include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h create mode 100644 lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt create mode 100644 lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp create mode 100644 test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h index 9fa75ca1e..c868dc887 100644 --- a/include/imex/Conversion/Passes.h +++ b/include/imex/Conversion/Passes.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td index 2fffae412..d0e198eb2 100644 --- a/include/imex/Conversion/Passes.td +++ b/include/imex/Conversion/Passes.td @@ -109,6 +109,19 @@ def DropRegions: Pass<"drop-regions"> { let options = []; } +//===----------------------------------------------------------------------===// +// ConvertRegionParallelLoopToGpu +//===----------------------------------------------------------------------===// + +def ConvertRegionParallelLoopToGpu : Pass<"convert-region-parallel-loops-to-gpu"> { + let summary = "Convert mapped scf.parallel ops within GPU regions to gpu launch operations"; + let description = [{ + Convert scf.parallel ops within GPU regions to gpu launch operations. 
+ }]; + let constructor = "::imex::createConvertRegionParallelLoopToGpuPass()"; + let dependentDialects = []; + let options = []; +} //===----------------------------------------------------------------------===// // GPUToSPIRV diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h b/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h new file mode 100644 index 000000000..d4f0437d8 --- /dev/null +++ b/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h @@ -0,0 +1,34 @@ +//===- RegionParallelLoopToGpu.h -------*- C++ -*-===// +// +// Copyright 2024 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Adds the conversion pattern from `scf.parallel` within `region.env_region` +/// to `gpu.launch`. +/// +//===----------------------------------------------------------------------===// + +#ifndef _RegionParallelLoopToGpu_H_INCLUDED_ +#define _RegionParallelLoopToGpu_H_INCLUDED_ + +#include + +namespace mlir { +class Pass; +} // namespace mlir + +namespace imex { +#define GEN_PASS_DECL_CONVERTREGIONPARALLELLOOPTOGPU +#include "imex/Conversion/Passes.h.inc" + +/// Create a pass to convert `scf.parallel` loops in GPU regions to `gpu.launch`. 
+std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass(); + +} // namespace imex + +#endif // _RegionParallelLoopToGpu_H_INCLUDED_ diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 715ce1d20..9828ea59d 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(ArithToVC) add_subdirectory(DistToStandard) add_subdirectory(NDArrayToLinalg) add_subdirectory(DropRegions) +add_subdirectory(RegionParallelLoopToGpu) add_subdirectory(GPUToSPIRV) add_subdirectory(GPUToGPUX) add_subdirectory(GPUXToLLVM) diff --git a/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt new file mode 100644 index 000000000..868f59d40 --- /dev/null +++ b/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt @@ -0,0 +1,12 @@ +add_imex_conversion_library(IMEXSCFToGPU + RegionParallelLoopToGpu.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/imex/Conversion/SCFToGPU + + DEPENDS + IMEXConversionPassIncGen + + LINK_LIBS PUBLIC + MLIRSCFToGPU +) diff --git a/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp b/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp new file mode 100644 index 000000000..20279bdfd --- /dev/null +++ b/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp @@ -0,0 +1,69 @@ +//===- RegionParallelLoopToGpu.cpp - --------------*- C++ -*-===// +// +// Copyright 2024 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file extends upstream ParallelLoopToGpuPass by applying the transform +/// only if the parallel loop is within a GPU region +/// (`region.env_region #region.gpu_env<...>`). 
+/// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +namespace imex { +#define GEN_PASS_DEF_CONVERTREGIONPARALLELLOOPTOGPU +#include "imex/Conversion/Passes.h.inc" + +namespace { +struct ConvertRegionParallelLoopToGpuPass + : public ::imex::impl::ConvertRegionParallelLoopToGpuBase< + ConvertRegionParallelLoopToGpuPass> { + ConvertRegionParallelLoopToGpuPass() = default; + + void runOnOperation() override { + ::mlir::RewritePatternSet patterns(&getContext()); + ::mlir::populateParallelLoopToGPUPatterns(patterns); + ::mlir::ConversionTarget target(getContext()); + target.markUnknownOpDynamicallyLegal( + [](::mlir::Operation *) { return true; }); + ::mlir::configureParallelLoopToGPULegality(target); + + // collect all gpu regions + ::mlir::SmallVector<::mlir::Operation *> ops; + getOperation()->walk([&](::imex::region::EnvironmentRegionOp op, + const ::mlir::WalkStage &stage) { + if (::imex::region::isGpuRegion(op)) { + ops.push_back(op); + return ::mlir::WalkResult::skip(); + } + return ::mlir::WalkResult::advance(); + }); + + // apply par-loop to gpu conversion to collected gpu regions + if (::mlir::failed( + ::mlir::applyPartialConversion(ops, target, std::move(patterns)))) { + signalPassFailure(); + } + ::mlir::finalizeParallelLoopToGPUConversion(getOperation()); + } +}; +} // namespace + +/// Create a pass to convert `scf.parallel` loops in GPU regions to `gpu.launch`. 
+std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass() { + return std::make_unique(); +} + +} // namespace imex diff --git a/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir b/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir new file mode 100644 index 000000000..8e1d09a8a --- /dev/null +++ b/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir @@ -0,0 +1,69 @@ +// RUN: imex-opt --split-input-file -convert-region-parallel-loops-to-gpu %s -verify-diagnostics -o -| FileCheck %s + +// 2-d parallel loop mapped to block.y and block.x + +// ----- +func.func @test_convert_region_parloop_gpu(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = arith.constant 2 : index + region.env_region #region.gpu_env { + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + region.env_region_yield + } + return +} +// CHECK: test_convert_region_parloop_gpu +// CHECK: region.env_region #region.gpu_env +// CHECK: gpu.launch + +// ----- +func.func @test_convert_region_parloop_cpu(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = arith.constant 2 : index + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + return +} +// CHECK: test_convert_region_parloop_cpu +// CHECK: scf.parallel + +// ----- +func.func @test_convert_region_parloop_combined(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step 
= arith.constant 2 : index + region.env_region "something" { + region.env_region #region.gpu_env { + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + region.env_region_yield + } + + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + } + return +} +// CHECK: test_convert_region_parloop_combined +// CHECK: region.env_region "something" { +// CHECK: region.env_region #region.gpu_env +// CHECK: gpu.launch +// CHECK: scf.parallel From 412577ed43fa4324275529aaa9bfeb0cedde8b48 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 5 Jul 2024 19:29:10 +0300 Subject: [PATCH 5/5] Add tile-loops pass --- include/imex/Transforms/Passes.h | 1 + include/imex/Transforms/Passes.td | 24 +++++ lib/Transforms/CMakeLists.txt | 1 + lib/Transforms/TileLoops.cpp | 141 ++++++++++++++++++++++++++++++ test/Transforms/tile-loops.mlir | 38 ++++++++ 5 files changed, 205 insertions(+) create mode 100644 lib/Transforms/TileLoops.cpp create mode 100644 test/Transforms/tile-loops.mlir diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h index f6e9baece..7392e2c8b 100644 --- a/include/imex/Transforms/Passes.h +++ b/include/imex/Transforms/Passes.h @@ -41,6 +41,7 @@ createOptimizeTransposePass(const std::string &device = "pvc"); std::unique_ptr createHoistTransposePass(); std::unique_ptr createVnniTransformationPass(); std::unique_ptr createEmulateNonNativeBF16Pass(); +std::unique_ptr createTileLoopsPass(); #define GEN_PASS_DECL #include "imex/Transforms/Passes.h.inc" diff --git 
a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td index 67630726c..ac9220a36 100644 --- a/include/imex/Transforms/Passes.td +++ b/include/imex/Transforms/Passes.td @@ -210,4 +210,28 @@ def HoistTranspose : Pass<"imex-xegpu-hoist-transpose"> { ]; } +def TileLoops : Pass<"tile-loops", "::mlir::func::FuncOp"> { + let summary = "Tile linalg.generic loops for GPU offloading"; + let description = [{ + Tiles loops defined with tensor inputs/outputs using the given tile sizes. + This pass should be applied after loop fusion and before bufferization. + Uses `TileUsingSCF` method. To map the loop to GPU blocks and threads this + pass should be called twice. If `in-regions` is set, only loops within GPU + regions are tiled. + }]; + let options = [ + ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">, + Option<"minTileFactor", "min-tile-factor", "int64_t", "2", + "Minimum factor between dimension size and a tile size">, + Option<"inRegions", "in-regions", "bool", "false", + "Convert loops only within GPU regions"> + ]; + let constructor = "imex::createTileLoopsPass()"; + let dependentDialects = [ + "::mlir::linalg::LinalgDialect", + "::mlir::scf::SCFDialect" + ]; +} + + #endif // _IMEX_TRANSFORMS_PASSES_TD_INCLUDED_ diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index b1b2a6bf3..071397eca 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -15,6 +15,7 @@ add_mlir_library(IMEXTransforms VnniTransformation.cpp OptimizeTranspose.cpp HoistTranspose.cpp + TileLoops.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/imex/Transforms diff --git a/lib/Transforms/TileLoops.cpp b/lib/Transforms/TileLoops.cpp new file mode 100644 index 000000000..edd358662 --- /dev/null +++ b/lib/Transforms/TileLoops.cpp @@ -0,0 +1,141 @@ +//===- TileLoops.cpp ------------------------------------*- C++ -*-===// +// +// Copyright 2023 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the TileLoops transform which tiles loops for GPU +/// mapping. +/// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/Support/Threading.h" +#include +#include + +namespace imex { +#define GEN_PASS_DEF_TILELOOPS +#include "imex/Transforms/Passes.h.inc" +} // namespace imex + +#define DEBUG_TYPE "tile-loops" + +#ifndef NDEBUG +#define DEBUG_MSG(PREFIX, MSG) \ + LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << "\n"); +#define DEBUG_OP(PREFIX, MSG, OP) \ + LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " '" << OP->getName() \ + << "' " << OP->getLoc() << "\n"); +#define DEBUG_OP_VEC(PREFIX, MSG, OPVEC) \ + LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " (" << OPVEC.size() \ + << ")\n"); \ + for (auto op : OPVEC) { \ + DEBUG_OP(PREFIX, " ", op) \ + } +#endif + +using namespace imex; + +namespace { + +static ::mlir::FailureOr<::mlir::SmallVector> +getDefaultTileSizes(::mlir::linalg::LinalgOp linalgOp, + ::mlir::ArrayRef userProvidedTiles) { + // The user-provided tiles are considered from the outer + // most loop. If not enough tiles are provided we pad with + // zeros. 
+ if (!userProvidedTiles.empty()) { + size_t numParallelLoops = linalgOp.getNumParallelLoops(); + size_t nonZeros = 0; + for (auto tile : userProvidedTiles) + if (tile != 0) + nonZeros++; + if (nonZeros > numParallelLoops || + userProvidedTiles.size() > linalgOp.getNumLoops()) { + return ::mlir::failure(); + } + + ::mlir::SmallVector userTiles(linalgOp.getNumLoops(), 0); + for (auto tile : ::llvm::enumerate(userProvidedTiles)) + userTiles[tile.index()] = tile.value(); + return userTiles; + } + return ::mlir::failure(); +} + +struct TileLoops final : public imex::impl::TileLoopsBase { + + using TileLoopsBase::TileLoopsBase; + + void runOnOperation() override { + + ::mlir::func::FuncOp func = getOperation(); + ::mlir::IRRewriter rewriter(&getContext()); + transform(rewriter, func, this->tileSizes, this->minTileFactor); + + return; + } + +private: + void transform(::mlir::RewriterBase &rewriter, ::mlir::func::FuncOp func, + ::mlir::ArrayRef tileSizes, int64_t minTileFactor) { + DEBUG_MSG("tile-loops", "Entering transform"); + ::mlir::SmallVector<::mlir::Operation *> allLinalgOps; + func->walk([&](::mlir::linalg::LinalgOp linalgOp) { + if (!inRegions || ::imex::region::isInGpuRegion(linalgOp)) { + allLinalgOps.push_back(linalgOp); + } + }); + DEBUG_OP_VEC("tile-loops", " Found linalg ops", allLinalgOps); + + for (auto op : allLinalgOps) { + DEBUG_OP("tile-loops", " Tiling op:", op); + auto tiles = getDefaultTileSizes( + ::llvm::cast<::mlir::linalg::LinalgOp>(op), tileSizes); + if (failed(tiles)) { + DEBUG_MSG("tile-loops", + " Failed to compute default tile sizes. 
Aborting."); + return; + } + DEBUG_MSG("tile-loops", " tile sizes:"); + LLVM_DEBUG(llvm::dbgs() << "tile-loops: ("); + LLVM_DEBUG(llvm::interleaveComma(*tiles, llvm::dbgs())); + LLVM_DEBUG(llvm::dbgs() << ")\n"); + + auto tilesRes = + ::mlir::getAsOpFoldResult(rewriter.getI64ArrayAttr(*tiles)); + ::mlir::scf::SCFTilingOptions options; + options.setTileSizes(tilesRes); + options.setLoopType(::mlir::scf::SCFTilingOptions::LoopType::ForallOp); + auto tileOp = ::mlir::cast<::mlir::TilingInterface>(op); + ::mlir::FailureOr<::mlir::scf::SCFTilingResult> tilingResult = + mlir::scf::tileUsingSCF(rewriter, tileOp, options); + if (failed(tilingResult)) { + DEBUG_MSG("tile-loops", " Failed to tile op. Aborting."); + return; + } + DEBUG_MSG("tile-loops", " Tiling applied successfully."); + rewriter.replaceOp(op, tilingResult.value().replacements); + } + } +}; + +} // end anonymous namespace + +namespace imex { +std::unique_ptr createTileLoopsPass() { + return std::make_unique(); +} +} // namespace imex diff --git a/test/Transforms/tile-loops.mlir b/test/Transforms/tile-loops.mlir new file mode 100644 index 000000000..098999f1a --- /dev/null +++ b/test/Transforms/tile-loops.mlir @@ -0,0 +1,38 @@ +// RUN: imex-opt --split-input-file -tile-loops='tile-sizes=32' -tile-loops='tile-sizes=1' %s -verify-diagnostics -o -| FileCheck %s + +#map = affine_map<(d0) -> (d0)> +module { + func.func @add(%arg0: tensor<129xf32>, %arg1: tensor<129xf32>, %arg2: tensor<129xf32>) -> tensor<129xf32> { + %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<129xf32>, tensor<129xf32>) outs(%arg2 : tensor<129xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %1 = arith.addf %in, %in_0 : f32 + linalg.yield %1 : f32 + } -> tensor<129xf32> + return %0 : tensor<129xf32> + } +} +// CHECK-LABEL: func.func @add +// CHECK-NEXT: %[[FORALL:.*]] = scf.forall (%arg3) = (0) to (129) step (32) shared_outs(%arg4 = %arg2) -> (tensor<129xf32>) { +// 
CHECK-NEXT: %[[C129:.*]] = arith.constant 129 : index +// CHECK-NEXT: %[[MIN:.*]] = affine.min #map(%[[ARG3:.*]]) +// CHECK-NEXT: %[[APPLY1:.*]] = affine.apply #map1(%[[MIN]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %arg0[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor +// CHECK-NEXT: %[[EXTRACTED_SLICE_0:.*]] = tensor.extract_slice %arg1[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor +// CHECK-NEXT: %[[EXTRACTED_SLICE_1:.*]] = tensor.extract_slice %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor +// CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[FORALL:.*]] = scf.forall (%[[ARG5:.*]]) in (%[[MIN]]) shared_outs(%[[ARG6:.*]] = %[[EXTRACTED_SLICE_1]]) -> (tensor) { +// CHECK-NEXT: %[[EXTRACTED_SLICE_4:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG5]]] [1] [1] : tensor to tensor<1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_0]][%[[ARG5]]] [1] [1] : tensor to tensor<1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_6:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG5]]] [1] [1] : tensor to tensor<1xf32> +// CHECK-NEXT: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%[[EXTRACTED_SLICE_4]], %[[EXTRACTED_SLICE_5]] : tensor<1xf32>, tensor<1xf32>) outs(%[[EXTRACTED_SLICE_6]] : tensor<1xf32>) { +// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[IN_7:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK-NEXT: %[[ADDF:.*]] = arith.addf %[[IN]], %[[IN_7]] : f32 +// CHECK-NEXT: linalg.yield %[[ADDF]] : f32 +// CHECK-NEXT: } -> tensor<1xf32> +// CHECK-NEXT: scf.forall.in_parallel { +// CHECK-NEXT: tensor.parallel_insert_slice %[[GENERIC]] into %[[ARG6]][%[[ARG5]]] [1] [1] : tensor<1xf32> into tensor +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK: scf.forall.in_parallel { +// CHECK-NEXT: tensor.parallel_insert_slice %[[FORALL]] into %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor into tensor<129xf32> +// CHECK-NEXT: }