[MHAL] Update MHAL to use gpu target attrs: SerializeToHSA to TargetAttr (2/3)

fabianmcg · fabianmcg · commit 28a4165462b6 · 2024-07-15T11:19:05.000-05:00
This patch updates the MHAL project to use the `gpu` target attributes
infrastructure. It is part 2 of 3 in the series switching the compilation
infrastructure from SerializeToHSA to `gpu` target attributes.

This includes:
- Updating `PrefillPass` to work on GPU binaries.
- Updating `PackageTargetsPass` to work on GPU binaries.
- Updating `MHALToGPU` to create `gpu.binary` operations.
- Updating `mhal::TargetObject` to store an attribute instead of a string.
- Adding the `DropMetadata` pass:
This pass drops all metadata from GPU binaries, e.g. the property dictionary and
kernel metadata. This is required to avoid unregistered-dialect errors
for attributes that are stored in the metadata, like `mhal.prefill`, but are not
parsable by tools like `mlir-cpu-runner`.
diff --git a/external/mlir-hal/include/mlir/Dialect/MHAL/IR/MHALAttrDefs.td b/external/mlir-hal/include/mlir/Dialect/MHAL/IR/MHALAttrDefs.td
@@ -40,7 +40,7 @@ def MHAL_TargetObjectAttr : MHAL_Attr<"TargetObject"> {
       AttrParameter<"::mlir::mhal::TargetObjectType", "The target object type">:$type,
       StringRefParameter<"The architecture target">:$arch,
       AttrParameter<"DictionaryAttr", "The object type">:$attributes,
-      StringRefParameter<"The object binary">:$binary
+      AttrParameter<"Attribute", "The object binary">:$binary
     );
 
     let genVerifyDecl = 0;
diff --git a/external/mlir-hal/include/mlir/Dialect/MHAL/Transforms/Passes.h b/external/mlir-hal/include/mlir/Dialect/MHAL/Transforms/Passes.h
@@ -33,6 +33,7 @@ namespace mhal {
 #define GEN_PASS_DECL_MHALSELECTTARGETSPASS
 #define GEN_PASS_DECL_MHALBUFFERIZEPASS
 #define GEN_PASS_DECL_MHALPREFILLPASS
+#define GEN_PASS_DECL_MHALDROPBINARYMETADATAPASS
 
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/MHAL/Transforms/Passes.h.inc"
diff --git a/external/mlir-hal/include/mlir/Dialect/MHAL/Transforms/Passes.td b/external/mlir-hal/include/mlir/Dialect/MHAL/Transforms/Passes.td
@@ -46,4 +46,8 @@ def MHALPrefillPass : Pass<"mhal-prefill", "func::FuncOp"> {
   let dependentDialects = ["mhal::MHALDialect"];
 }
 
+def MHALDropBinaryMetadataPass : Pass<"mhal-drop-binary-metadata", "ModuleOp"> {
+  let summary = "drops all metadata stored in GPU binaries";
+}
+
 #endif // MLIR_DIALECT_MHAL_PASSES
diff --git a/external/mlir-hal/lib/Conversion/MHALToGPU/MHALToGPU.cpp b/external/mlir-hal/lib/Conversion/MHALToGPU/MHALToGPU.cpp
@@ -172,7 +172,6 @@ struct LaunchRewritePattern : public OpRewritePattern<mhal::LaunchOp> {
     if (!kernelPkg.has_value())
       return rw.notifyMatchFailure(op, "no gpu target");
 
-    auto arch = kernelPkg->getTarget();
     auto targetObj = kernelPkg->getObject();
     auto binary = targetObj.getBinary();
     auto launchDims = kernelPkg->getLaunchDims();
@@ -184,46 +183,20 @@ struct LaunchRewritePattern : public OpRewritePattern<mhal::LaunchOp> {
     auto func = *getCalledFunc(op);
     Location floc = func.getLoc();
 
-    // 2. create dummy gpu.module for reference from gpu.launch_func
-    //    - with gpu.binary, arch attributes
-    //    - and gpu.func (referenced by gpu.launch_func
-    //    gpu.module @<func_name>_module attributes {arch = "gfx908", gpu.binary
-    //        = "\7FELF\..."} {
-    //      gpu.func @<func_name> (...) attributes {block_size = 256 : i32,
-    //          grid_size = 900 : i32, gpu.kernel}
+    // 2. re-materialize gpu.binary @<func_name>_module [#gpu.object<...>]
 
     FunctionOpInterface funcIF(func);
     auto funcName = funcIF.getName();
-    auto gpuModuleName = funcName + "_module";
+    auto binaryName = funcName + "_module";
 
-    auto gpuModule = module.lookupSymbol<gpu::GPUModuleOp>(gpuModuleName.str());
-    if (!gpuModule) {
+    auto binaryOp = module.lookupSymbol<gpu::BinaryOp>(binaryName.str());
+    if (!binaryOp) {
       OpBuilder b(ctx);
-      gpuModule = b.create<gpu::GPUModuleOp>(floc, gpuModuleName.str());
-      gpuModule->setAttr("arch", b.getStringAttr(arch));
-      gpuModule->setAttr("gpu.binary", b.getStringAttr(binary));
+      binaryOp = b.create<gpu::BinaryOp>(floc, binaryName.str(), nullptr,
+                                         ArrayRef<Attribute>({binary}));
 
       SymbolTable symbolTable(module);
-      symbolTable.insert(gpuModule);
-    }
-
-    auto gpuFunc = gpuModule.lookupSymbol<gpu::GPUFuncOp>(funcName);
-    if (!gpuFunc) {
-      OpBuilder b(gpuModule.getContext());
-      gpuFunc =
-          b.create<gpu::GPUFuncOp>(floc, funcName, func.getFunctionType());
-      gpuFunc->setAttr("block_size", b.getI32IntegerAttr(blockSize));
-      gpuFunc->setAttr("grid_size", b.getI32IntegerAttr(gridSize));
-      gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
-                       b.getUnitAttr());
-
-      SymbolTable symbolTable(gpuModule);
-      symbolTable.insert(gpuFunc);
-
-      // Must have a return
-      auto block = &gpuFunc.front();
-      b.setInsertionPoint(block, block->begin());
-      b.create<gpu::ReturnOp>(floc, ValueRange{});
+      symbolTable.insert(binaryOp);
     }
 
     // 3. create substitute gpu.launch_func
@@ -281,9 +254,12 @@ struct LaunchRewritePattern : public OpRewritePattern<mhal::LaunchOp> {
 
     // Make gpu.launch_func
     auto gpuLaunchOp = rw.create<gpu::LaunchFuncOp>(
-        loc, gpuFunc, gpu::KernelDim3{gridSizeIdx, oneIdx, oneIdx},
+        loc,
+        SymbolRefAttr::get(getContext(), binaryName.str(),
+                           {FlatSymbolRefAttr::get(getContext(), funcName)}),
+        gpu::KernelDim3{gridSizeIdx, oneIdx, oneIdx},
         gpu::KernelDim3{blockSizeIdx, oneIdx, oneIdx}, dynamicSharedMemorySize,
-        gpuOperands, tokenType, asyncDeps);
+        gpuOperands, tokenType, ValueRange(asyncDeps));
     Value token = gpuLaunchOp->getResult(0);
 
     // Insert gpu.memcpy for results
diff --git a/external/mlir-hal/lib/Dialect/MHAL/IR/MHAL.cpp b/external/mlir-hal/lib/Dialect/MHAL/IR/MHAL.cpp
@@ -99,8 +99,8 @@ mlir::Attribute TargetObjectAttr::parse(mlir::AsmParser &parser,
     return {};
   }
 
-  std::string binary;
-  if (parser.parseKeywordOrString(&binary)) {
+  Attribute binary;
+  if (parser.parseAttribute(binary)) {
     return {};
   }
 
@@ -129,7 +129,7 @@ void TargetObjectAttr::print(mlir::AsmPrinter &printer) const {
 
   // print binary
   printer << " -> ";
-  printer.printKeywordOrString(getBinary());
+  printer << getBinary();
   printer << ">";
 }
 
diff --git a/external/mlir-hal/lib/Dialect/MHAL/Pipelines/Pipelines.cpp b/external/mlir-hal/lib/Dialect/MHAL/Pipelines/Pipelines.cpp
@@ -150,6 +150,7 @@ void mhal::buildRunnerPipeline(OpPassManager &pm,
   GpuToLLVMConversionPassOptions opts;
   opts.kernelBarePtrCallConv = options.barePtrMemrefs;
   pm.addPass(createGpuToLLVMConversionPass(opts));
+  pm.addPass(createMHALDropBinaryMetadataPass());
 
   pm.addPass(createConvertFuncToLLVMPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/external/mlir-hal/lib/Dialect/MHAL/Transforms/CMakeLists.txt b/external/mlir-hal/lib/Dialect/MHAL/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRMHALTransforms
   Bufferize.cpp
   BufferizableOpInterfaceImpl.cpp
+  DropMetadata.cpp
   InferGraph.cpp
   PackageTargets.cpp
   SelectTargets.cpp
@@ -26,4 +27,3 @@ add_mlir_dialect_library(MLIRMHALTransforms
   MLIRSupport
   MLIRTransformUtils
 )
-
diff --git a/external/mlir-hal/lib/Dialect/MHAL/Transforms/DropMetadata.cpp b/external/mlir-hal/lib/Dialect/MHAL/Transforms/DropMetadata.cpp
@@ -0,0 +1,45 @@
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MHAL/Transforms/Passes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+
+namespace mlir {
+namespace mhal {
+#define GEN_PASS_DEF_MHALDROPBINARYMETADATAPASS
+#include "mlir/Dialect/MHAL/Transforms/Passes.h.inc"
+} // namespace mhal
+} // namespace mlir
+
+#define DEBUG_TYPE "mhal-drop-binary-metadata" // matches the pass argument
+
+using namespace mlir;
+
+namespace {
+class MHALDropBinaryMetadataPass
+    : public mhal::impl::MHALDropBinaryMetadataPassBase<
+          MHALDropBinaryMetadataPass> {
+public:
+  // Visit the module's gpu::BinaryOp ops and clear the metadata they carry.
+  void runOnOperation() override;
+};
+} // namespace
+
+// Strip discardable attrs and per-object metadata from each gpu.binary op.
+void MHALDropBinaryMetadataPass::runOnOperation() {
+  Builder b(&getContext());
+  for (gpu::BinaryOp binary :
+       getOperation().getBody()->getOps<gpu::BinaryOp>()) {
+    // Replace the op's discardable attributes with an empty dictionary.
+    binary->setDiscardableAttrs(b.getDictionaryAttr({}));
+    SmallVector<Attribute, 10> objects;
+    for (auto objRaw : binary.getObjects()) {
+      auto object = cast<gpu::ObjectAttr>(objRaw);
+      // Keep target/format/payload; null out properties and kernel table.
+      objects.push_back(
+          b.getAttr<gpu::ObjectAttr>(object.getTarget(), object.getFormat(),
+                                     object.getObject(), nullptr, nullptr));
+    }
+    binary.setObjectsAttr(b.getArrayAttr(objects));
+  }
+}
diff --git a/external/mlir-hal/lib/Dialect/MHAL/Transforms/PackageTargets.cpp b/external/mlir-hal/lib/Dialect/MHAL/Transforms/PackageTargets.cpp
@@ -22,6 +22,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/MHAL/IR/MHAL.h"
 #include "mlir/Dialect/MHAL/Transforms/Passes.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -41,7 +42,6 @@ namespace mhal {
 using namespace mlir;
 
 namespace {
-
 struct MHALPackageTargetsPass
     : public mhal::impl::MHALPackageTargetsPassBase<MHALPackageTargetsPass> {
 
@@ -55,36 +55,30 @@ struct MHALPackageTargetsPass
 
     mod->walk([&](ModuleOp kernelMod) {
       if (kernelMod->hasAttr("mhal.module")) {
-        SmallVector<gpu::GPUModuleOp, 8> gpuMods;
-        kernelMod->walk([&](gpu::GPUModuleOp gpuMod) {
-          auto binaryAttr = gpuMod->getAttrOfType<StringAttr>(
-              gpu::getDefaultGpuBinaryAnnotation());
-          if (!binaryAttr) {
-            gpuMod.emitOpError() << "missing gpu.binary attribute";
-            return;
-          }
-
-          gpuMods.push_back(gpuMod);
-
+        SmallVector<gpu::BinaryOp, 8> binaries;
+        kernelMod->walk([&](gpu::BinaryOp binary) {
+          auto object = cast<gpu::ObjectAttr>(binary.getObjects()[0]);
+          binaries.push_back(binary);
+          gpu::KernelTableAttr metadata = object.getKernels();
+          assert(metadata && "expected a valid metadata attribute");
           // apply target spec to original func
-          gpuMod.walk([&](LLVM::LLVMFuncOp func) {
-            if (auto attr =
-                    func->getAttrOfType<SymbolRefAttr>("original_func")) {
+          for (auto [name, kernel] : metadata) {
+            if (auto attr = kernel.getAttr<SymbolRefAttr>("original_func")) {
               if (auto kernelFunc = mod.lookupSymbol<func::FuncOp>(attr)) {
                 auto archName =
                     kernelMod->getAttrOfType<StringAttr>("mhal.arch")
                         .getValue();
                 auto funcName = attr.getLeafReference().getValue();
                 uint32_t gridSize =
-                    func->getAttrOfType<IntegerAttr>("grid_size").getInt();
+                    kernel.getAttr<IntegerAttr>("grid_size").getInt();
                 uint32_t blockSize =
-                    func->getAttrOfType<IntegerAttr>("block_size").getInt();
+                    kernel.getAttr<IntegerAttr>("block_size").getInt();
 
                 DictionaryAttr objAttrs;
 
                 auto xobj = mhal::TargetObjectAttr::get(
                     b.getContext(), mhal::TargetObjectType::ELF, archName,
-                    objAttrs, binaryAttr);
+                    objAttrs, object);
 
                 DictionaryAttr pkgAttrs;
                 // = b.getDictionaryAttr({
@@ -97,12 +91,12 @@ struct MHALPackageTargetsPass
                 kernelImpls[kernelFunc].push_back(xpkg);
               }
             }
-          });
+          }
         });
 
         // clean processed gpu.modules
-        for (auto gpuMod : gpuMods) {
-          gpuMod.erase();
+        for (auto binary : binaries) {
+          binary.erase();
         }
 
         // remove __kernel_*
diff --git a/external/mlir-hal/lib/Dialect/MHAL/Transforms/Prefill.cpp b/external/mlir-hal/lib/Dialect/MHAL/Transforms/Prefill.cpp
@@ -48,14 +48,17 @@ void MHALPrefillPass::insertPrefillOps(OpBuilder &builder,
                                        gpu::LaunchFuncOp &launchOp) {
   auto func = cast<func::FuncOp>(launchOp->getParentOp());
   auto module = cast<ModuleOp>(func->getParentOp());
-  auto kernel = launchOp.getKernel();
-  auto *callee = module.lookupSymbol(kernel);
-  assert(callee != nullptr && "expect to find the function defenition");
-  auto llvmFunc = cast<LLVM::LLVMFuncOp>(callee);
-  auto gpuModule = cast<gpu::GPUModuleOp>(llvmFunc->getParentOp());
-
+  auto binaryName = launchOp.getKernelModuleName();
+  auto binary = module.lookupSymbol<gpu::BinaryOp>(binaryName);
+  assert(binary != nullptr && "expect to find the function defenition");
+  auto objects = binary.getObjects().getValue();
+  assert(objects.size() == 1 && "expected a single object");
   SmallVector<mhal::PrefillAttr, 4> prefillAttrs;
-  if (auto moduleAttr = gpuModule->getAttr(llvmFunc.getSymName())) {
+  auto object = cast<gpu::ObjectAttr>(objects[0]);
+  DictionaryAttr objectProps = object.getProperties();
+  if (!objectProps)
+    return;
+  if (auto moduleAttr = objectProps.get(launchOp.getKernelName())) {
     if (auto arrayAttr = dyn_cast<ArrayAttr>(moduleAttr)) {
       for (auto attr : arrayAttr) {
         if (auto prefillAttr = dyn_cast<mhal::PrefillAttr>(attr)) {

Original file line number	Diff line number	Diff line change
`@@ -46,4 +46,8 @@ def MHALPrefillPass : Pass<"mhal-prefill", "func::FuncOp"> {`
`46`	`46`	`let dependentDialects = ["mhal::MHALDialect"];`
`47`	`47`	`}`
`48`	`48`
	`49`	`+def MHALDropBinaryMetadataPass : Pass<"mhal-drop-binary-metadata", "ModuleOp"> {`
	`50`	`+ let summary = "drops all metadata stored in GPU binaries";`
	`51`	`+}`
	`52`	`+`
`49`	`53`	`#endif // MLIR_DIALECT_MHAL_PASSES`
Original file line number	Diff line number	Diff line change
`@@ -99,8 +99,8 @@ mlir::Attribute TargetObjectAttr::parse(mlir::AsmParser &parser,`
`99`	`99`	`return {};`
`100`	`100`	`}`
`101`	`101`
`102`		`- std::string binary;`
`103`		`- if (parser.parseKeywordOrString(&binary)) {`
	`102`	`+ Attribute binary;`
	`103`	`+ if (parser.parseAttribute(binary)) {`
`104`	`104`	`return {};`
`105`	`105`	`}`
`106`	`106`
`@@ -129,7 +129,7 @@ void TargetObjectAttr::print(mlir::AsmPrinter &printer) const {`
`129`	`129`
`130`	`130`	`// print binary`
`131`	`131`	`printer << " -> ";`
`132`		`- printer.printKeywordOrString(getBinary());`
	`132`	`+ printer << getBinary();`
`133`	`133`	`printer << ">";`
`134`	`134`	`}`
`135`	`135`