intel · fschlimb · Dec 17, 2024 · Apr 25, 2024 · Apr 26, 2024 · Apr 25, 2024
diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h
@@ -25,6 +25,7 @@
 #include <imex/Conversion/GPUXToLLVM/GPUXToLLVMPass.h>
 #include <imex/Conversion/MathToVC/MathToVC.h>
 #include <imex/Conversion/NDArrayToLinalg/NDArrayToLinalg.h>
+#include <imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h>
 #include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
 #include <imex/Conversion/XeTileToXeGPU/XeTileToXeGPU.h>
 

diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
@@ -109,6 +109,19 @@ def DropRegions: Pass<"drop-regions"> {
   let options = [];
 }
 
+//===----------------------------------------------------------------------===//
+// ConvertRegionParallelLoopToGpu
+//===----------------------------------------------------------------------===//
+
+// Pass definition for `convert-region-parallel-loops-to-gpu`; instances are
+// created through ::imex::createConvertRegionParallelLoopToGpuPass().
+def ConvertRegionParallelLoopToGpu : Pass<"convert-region-parallel-loops-to-gpu"> {
+  let summary = "Convert mapped scf.parallel ops within GPU regions to gpu launch operations";
+  let description = [{
+    Convert scf.parallel ops within GPU regions to gpu launch operations.
+  }];
+  let constructor = "::imex::createConvertRegionParallelLoopToGpuPass()";
+  // The pass introduces gpu.launch ops, so the GPU dialect must be loaded
+  // before it runs. The Affine dialect is declared as well, mirroring the
+  // upstream convert-parallel-loops-to-gpu pass whose patterns are reused.
+  let dependentDialects = ["::mlir::affine::AffineDialect",
+                           "::mlir::gpu::GPUDialect"];
+  let options = [];
+}
 
 //===----------------------------------------------------------------------===//
 // GPUToSPIRV

diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h b/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h
@@ -0,0 +1,34 @@
+//===- RegionParallelLoopToGpu.h -------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Adds the conversion pattern from `scf.parallel` within `region.env_region`
+/// to `gpu.launch`.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _RegionParallelLoopToGpu_H_INCLUDED_
+#define _RegionParallelLoopToGpu_H_INCLUDED_
+
+#include <mlir/IR/PatternMatch.h>
+
+namespace mlir {
+class Pass;
+} // namespace mlir
+
+namespace imex {
+#define GEN_PASS_DECL_CONVERTREGIONPARALLELLOOPTOGPU
+#include "imex/Conversion/Passes.h.inc"
+
+/// Create a pass that converts `scf.parallel` ops within GPU regions to
+/// `gpu.launch` ops.
+std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass();
+
+} // namespace imex
+
+#endif // _RegionParallelLoopToGpu_H_INCLUDED_
diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h
@@ -25,6 +25,7 @@ namespace imex {
 std::unique_ptr<mlir::Pass> createSerializeSPIRVPass();
 std::unique_ptr<mlir::Pass>
 createInsertGPUAllocsPass(const char *clientAPI = "vulkan");
+std::unique_ptr<mlir::Pass> createInsertGPUCopyPass();
 std::unique_ptr<mlir::Pass> createSetSPIRVCapabilitiesPass();
 std::unique_ptr<mlir::Pass>
 createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan");
@@ -40,6 +41,7 @@ createOptimizeTransposePass(const std::string &device = "pvc");
 std::unique_ptr<mlir::Pass> createHoistTransposePass();
 std::unique_ptr<mlir::Pass> createVnniTransformationPass();
 std::unique_ptr<mlir::Pass> createEmulateNonNativeBF16Pass();
+std::unique_ptr<mlir::Pass> createTileLoopsPass();
 
 #define GEN_PASS_DECL
 #include "imex/Transforms/Passes.h.inc"

diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
@@ -41,10 +41,20 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
     Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
            "The client API to use for inserting gpu allocs">,
     Option<"inRegions", "in-regions", "bool", "false",
-           "Add gpu allocs only for memref.AllocOps within GPU regions">
+           "Add gpu allocs only for memref.AllocOps within GPU regions">,
+    Option<"hostShared", "host-shared", "bool", "true",
+           "If set, allocate shared memory accessible on both host and device.">
   ];
 }
 
+// Function-level pass `insert-gpu-copy`: rewrites memref.copy ops that sit
+// inside an environment region into gpu.memcpy ops.
+// NOTE(review): arith is listed as a dependent dialect; confirm whether the
+// implementation actually creates arith ops.
+def InsertGPUCopy : Pass<"insert-gpu-copy", "::mlir::func::FuncOp"> {
+  let summary = "Converts memref.copy op to gpu.memcpy if within an env region.";
+  let constructor = "imex::createInsertGPUCopyPass()";
+  let dependentDialects = ["::mlir::memref::MemRefDialect",
+                           "::mlir::gpu::GPUDialect",
+                           "::mlir::arith::ArithDialect"];
+}
+
 def SetSPIRVCapabilities : Pass<"set-spirv-capabilities"> {
   let summary = "Sets Spirv capabilities";
   let constructor = "imex::createSetSPIRVCapabilitiesPass()";
@@ -200,4 +210,28 @@ def HoistTranspose : Pass<"imex-xegpu-hoist-transpose"> {
   ];
 }
 
+// Function-level pass `tile-loops`; instances are created through
+// imex::createTileLoopsPass().
+def TileLoops : Pass<"tile-loops", "::mlir::func::FuncOp"> {
+  let summary = "Tile linalg.generic loops for GPU offloading";
+  let description = [{
+    Tiles loops defined with tensor inputs/outputs using the given tile sizes.
+    This pass should be applied after loop fusion and before bufferization.
+    Uses `TileUsingSCF` method. To map the loop to GPU blocks and threads this
+    pass should be called twice. If `in-regions` is set, only loops within GPU
+    regions are tiled.
+  }];
+  let options = [
+    ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">,
+    Option<"minTileFactor", "min-tile-factor", "int64_t", "2",
+           "Minimum factor between dimension size and a tile size">,
+    // This pass tiles loops; it does not convert them.
+    Option<"inRegions", "in-regions", "bool", "false",
+           "Tile loops only within GPU regions">
+  ];
+  let constructor = "imex::createTileLoopsPass()";
+  let dependentDialects = [
+    "::mlir::linalg::LinalgDialect",
+    "::mlir::scf::SCFDialect"
+  ];
+}
+
+
 #endif // _IMEX_TRANSFORMS_PASSES_TD_INCLUDED_
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(ArithToVC)
 add_subdirectory(DistToStandard)
 add_subdirectory(NDArrayToLinalg)
 add_subdirectory(DropRegions)
+add_subdirectory(RegionParallelLoopToGpu)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToGPUX)
 add_subdirectory(GPUXToLLVM)

diff --git a/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp b/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp
@@ -526,12 +526,15 @@ struct CopyLowering
         ::mlir::MemRefType::get(tTyp.getShape(), tTyp.getElementType());
     auto mr = rewriter.create<::mlir::memref::AllocOp>(
         loc, mrTyp, dynDims, rewriter.getI64IntegerAttr(8));
-    // and copy if non-0
+    // and copy if not zero sized
     if (!retArTyp.hasZeroSize()) {
       auto srcMR =
           createToMemRef(loc, rewriter, src, srcArTyp.getMemRefType(src));
-      // create a region with given env, add copy op within it
-      auto env = rewriter.getStringAttr("protect_copy_op");
+      // wrap copy in a region to mark it non-deletable or a gpu copy
+      bool hasGPUEnv = ::imex::ndarray::hasGPUEnv(srcArTyp) ||
+                       ::imex::ndarray::hasGPUEnv(retArTyp);
+      std::string regName = hasGPUEnv ? "gpu_copy_op" : "protect_copy_op";
+      auto env = rewriter.getStringAttr(regName);
       rewriter.create<::imex::region::EnvironmentRegionOp>(
           loc, env, std::nullopt, std::nullopt,
           [&srcMR, &mr](::mlir::OpBuilder &builder, ::mlir::Location loc) {

diff --git a/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Conversion library implementing the convert-region-parallel-loops-to-gpu
+# pass. The target name IMEXSCFToGPU is left unchanged because other build
+# files may reference it.
+add_imex_conversion_library(IMEXSCFToGPU
+  RegionParallelLoopToGpu.cpp
+
+  # Public headers of this conversion live in RegionParallelLoopToGpu/.
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/imex/Conversion/RegionParallelLoopToGpu
+
+  DEPENDS
+  IMEXConversionPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRSCFToGPU
+)
diff --git a/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp b/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp
@@ -0,0 +1,69 @@
+//===- RegionParallelLoopToGpu.cpp -  --------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file extends upstream ParallelLoopToGpuPass by applying the transform
+/// only if the parallel loop is within a GPU region
+/// (`region.env_region #region.gpu_env<...>`).
+///
+//===----------------------------------------------------------------------===//
+
+#include <imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h>
+#include <imex/Dialect/Region/RegionUtils.h>
+#include <mlir/Conversion/SCFToGPU/SCFToGPU.h>
+#include <mlir/Dialect/Affine/IR/AffineOps.h>
+#include <mlir/Dialect/GPU/IR/GPUDialect.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/Pass/Pass.h>
+#include <mlir/Transforms/DialectConversion.h>
+
+namespace imex {
+#define GEN_PASS_DEF_CONVERTREGIONPARALLELLOOPTOGPU
+#include "imex/Conversion/Passes.h.inc"
+
+namespace {
+/// Runs the upstream parallel-loop-to-GPU conversion patterns, restricted to
+/// the GPU environment regions nested in the operation the pass runs on.
+struct ConvertRegionParallelLoopToGpuPass
+    : public ::imex::impl::ConvertRegionParallelLoopToGpuBase<
+          ConvertRegionParallelLoopToGpuPass> {
+  ConvertRegionParallelLoopToGpuPass() = default;
+
+  /// Collects every environment region for which `isGpuRegion` holds and
+  /// applies a partial conversion to exactly those ops. Signals pass failure
+  /// if the conversion fails.
+  void runOnOperation() override {
+    // Unknown ops are legal by default; the upstream helper then configures
+    // the legality of the ops this conversion cares about.
+    ::mlir::ConversionTarget target(getContext());
+    target.markUnknownOpDynamicallyLegal(
+        [](::mlir::Operation *) { return true; });
+    ::mlir::configureParallelLoopToGPULegality(target);
+
+    ::mlir::RewritePatternSet patterns(&getContext());
+    ::mlir::populateParallelLoopToGPUPatterns(patterns);
+
+    // Gather the GPU regions; they are the only roots handed to the
+    // conversion driver. The WalkStage parameter is unused in the body but
+    // is part of the callback signature and must stay.
+    ::mlir::SmallVector<::mlir::Operation *> gpuRegions;
+    getOperation()->walk(
+        [&gpuRegions](::imex::region::EnvironmentRegionOp envRegion,
+                      const ::mlir::WalkStage &) {
+          if (!::imex::region::isGpuRegion(envRegion))
+            return ::mlir::WalkResult::advance();
+          gpuRegions.push_back(envRegion);
+          return ::mlir::WalkResult::skip();
+        });
+
+    const auto status = ::mlir::applyPartialConversion(gpuRegions, target,
+                                                       std::move(patterns));
+    if (::mlir::failed(status))
+      signalPassFailure();
+
+    // Runs whether or not the conversion succeeded.
+    ::mlir::finalizeParallelLoopToGPUConversion(getOperation());
+  }
+};
+} // namespace
+
+/// Create a pass that converts `scf.parallel` ops within GPU regions to `gpu.launch` ops.
+std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass() {
+  return std::make_unique<ConvertRegionParallelLoopToGpuPass>();
+}
+
+} // namespace imex
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_library(IMEXTransforms
   CastIndex.cpp
   EmulateNonNativeBF16.cpp
   InsertGPUAllocs.cpp
+  InsertGPUCopy.cpp
   LowerMemRefCopy.cpp
   RemoveSingleElemVector.cpp
   RemoveTemporaries.cpp
@@ -14,6 +15,7 @@ add_mlir_library(IMEXTransforms
   VnniTransformation.cpp
   OptimizeTranspose.cpp
   HoistTranspose.cpp
+  TileLoops.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/imex/Transforms

diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
@@ -105,7 +105,7 @@ class InsertGPUAllocsPass final
         auto allocResult = builder.create<::mlir::gpu::AllocOp>(
             alloc.getLoc(), alloc.getType(), /*asyncToken*/ nullptr,
             /*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
-            alloc.getSymbolOperands(), true);
+            alloc.getSymbolOperands(), /*hostShared*/ hostShared);
         alloc.replaceAllUsesWith(allocResult);
         alloc.erase();
       }

diff --git a/lib/Transforms/InsertGPUCopy.cpp b/lib/Transforms/InsertGPUCopy.cpp
@@ -0,0 +1,89 @@
+//===- InsertGPUCopy.cpp - InsertGPUCopy Pass  -------*- C++ -*-===//
+//
+// Copyright 2022 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file replaces the memref.copy ops with gpu.memcpy ops if the
+/// memref.copy resides in an environment region. This environment region must
+/// be created in a prior pass where the device/host memory semantics are
+/// present.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Threading.h"
+#include <imex/Transforms/Passes.h>
+
+#include <imex/Dialect/Region/RegionUtils.h>
+#include <mlir/Dialect/Affine/IR/AffineOps.h>
+#include <mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/GPU/Transforms/Passes.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Pass/Pass.h>
+#include <optional>
+
+namespace imex {
+#define GEN_PASS_DEF_INSERTGPUCOPY
+#include "imex/Transforms/Passes.h.inc"
+} // namespace imex
+
+namespace {
+
+/// Returns true when `op` is non-null and has an enclosing
+/// `EnvironmentRegionOp` ancestor. The kind of environment is not inspected.
+inline bool isInEnvRegion(::mlir::Operation *op) {
+  return op != nullptr &&
+         static_cast<bool>(
+             op->getParentOfType<::imex::region::EnvironmentRegionOp>());
+}
+
+/// Function pass that replaces each `memref.copy` nested in an environment
+/// region with a `gpu.memcpy` using the same target and source operands.
+class InsertGPUCopyPass final
+    : public imex::impl::InsertGPUCopyBase<InsertGPUCopyPass> {
+public:
+  /// Skips functions without a body, rejects functions with more than one
+  /// block, and otherwise rewrites the qualifying copy ops in place.
+  void runOnOperation() override {
+    auto func = getOperation();
+    auto &body = func.getBody();
+
+    // A function without a body has nothing to rewrite.
+    if (body.empty())
+      return;
+    if (!llvm::hasSingleElement(body)) {
+      func.emitError("Function must have exactly one block");
+      signalPassFailure();
+      return;
+    }
+
+    // Gather first and rewrite afterwards, so no op is erased while the
+    // function is still being walked.
+    ::mlir::SmallVector<::mlir::memref::CopyOp> worklist;
+    func.walk([&worklist](::mlir::memref::CopyOp copy) {
+      if (isInEnvRegion(copy))
+        worklist.emplace_back(copy);
+    });
+
+    mlir::OpBuilder builder(func);
+    for (::mlir::memref::CopyOp copy : worklist) {
+      builder.setInsertionPoint(copy);
+      // No result types and no async dependencies are passed to the memcpy.
+      builder.create<::mlir::gpu::MemcpyOp>(
+          copy.getLoc(), /*resultTypes*/ ::mlir::TypeRange{},
+          /*asyncDependencies*/ ::mlir::ValueRange{}, copy.getTarget(),
+          copy.getSource());
+      copy.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace imex {
+/// Create an instance of the `insert-gpu-copy` pass (InsertGPUCopyPass).
+std::unique_ptr<mlir::Pass> createInsertGPUCopyPass() {
+  return std::make_unique<InsertGPUCopyPass>();
+}
+} // namespace imex