diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td
index 652ee9f60..e1eed0b50 100644
--- a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td
@@ -112,7 +112,7 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe
                        shape.size() == strides.size()
                      )
            );
-    assert(offsets.size() == dynamicDims);
+    assert((std::ptrdiff_t)offsets.size() == dynamicDims);
 
     $_state.addOperands(source);
     $_state.addOperands(offsets);
diff --git a/include/imex/Utils/DebugUtils.h b/include/imex/Utils/DebugUtils.h
index 9c98809d3..3041c4f81 100644
--- a/include/imex/Utils/DebugUtils.h
+++ b/include/imex/Utils/DebugUtils.h
@@ -8,7 +8,8 @@
 #include
 #include
 
-static std::string getValueAsString(mlir::Value op, bool asOperand = false) {
+[[maybe_unused]] static std::string getValueAsString(mlir::Value op,
+                                                     bool asOperand = false) {
   std::string buf;
   buf.clear();
   llvm::raw_string_ostream os(buf);
diff --git a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp
index 9b044065d..af655bc6c 100644
--- a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp
+++ b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp
@@ -1484,7 +1484,7 @@ unsigned getElementPerWI(imex::xegpu::TensorDescType tDescType) {
   auto wiLayout = sgMap.getWiLayout();
   auto wiData = sgMap.getWiData();
   unsigned elemPerWI = 1;
-  for (size_t i = 0; i < wiData.size(); i++) {
+  for (int64_t i = 0; i < wiData.size(); i++) {
     if (wiData[i] != 1)
       llvm_unreachable("wi_data must be 1 for all dimension for "
                        "JointMatrix lowering");
diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
index 79e5ae9f1..ce6b35475 100644
--- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
+++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
@@ -108,7 +108,7 @@ struct SgPrefetchTileOpPattern
       return mlir::failure();
 
     auto shape = tileTy.getShape();
-    if (shape[0] * shape[1] != tiles.size()) {
+    if (shape[0] * shape[1] != (int64_t)tiles.size()) {
       op.emitOpError("Failed to lower LoadTileOp because shape[0] * shape[1] "
                      "!= sources.size().");
       return mlir::failure();
@@ -153,7 +153,7 @@ struct SgLoadTileOpPattern
     auto shape = resultTy.getShape();
     auto sources = adaptor.getSource();
 
-    if (shape[0] * shape[1] != sources.size()) {
+    if (shape[0] * shape[1] != (int64_t)sources.size()) {
       op.emitOpError("Failed to lower LoadTileOp because shape[0] * shape[1] "
                      "!= sources.size().");
       return mlir::failure();
diff --git a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 6756f3bc9..7c22c7d67 100644
--- a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -237,7 +237,8 @@ static bool verifyAndInferShape(std::vector<int64_t> &shape,
     auto sgData = wgMap.getSgData();
     auto sgLayout = wgMap.getSgLayout();
 
-    if (shape.size() != sgData.size() || shape.size() != sgLayout.size())
+    if ((int64_t)shape.size() != sgData.size() ||
+        (int64_t)shape.size() != sgLayout.size())
       return false;
 
     for (size_t i = 0; i < shape.size(); i++) {
@@ -254,11 +255,12 @@
     auto wiLayout = sgMap.getWiLayout();
     auto wiData = sgMap.getWiData();
 
-    if (blockSize && shape.size() != blockSize.size()) {
+    if (blockSize && (int64_t)shape.size() != blockSize.size()) {
       return false;
     }
 
-    if (shape.size() != wiData.size() || shape.size() != wiLayout.size()) {
+    if ((int64_t)shape.size() != wiData.size() ||
+        (int64_t)shape.size() != wiLayout.size()) {
       return false;
     }
@@ -840,7 +842,7 @@ ::mlir::ParseResult StoreNDOp::parse(::mlir::OpAsmParser &parser,
 
 void StoreNDOp::print(::mlir::OpAsmPrinter &printer) {
   auto mode = getMode();
-  bool printSep = false;
+  [[maybe_unused]] bool printSep = false;
   auto printDefaults = printDefaultValues();
   auto numAttrs = (*this)->getAttrs().size();
 
@@ -997,7 +999,7 @@ ::mlir::ParseResult PrefetchNDOp::parse(::mlir::OpAsmParser &parser,
 
 void PrefetchNDOp::print(::mlir::OpAsmPrinter &printer) {
   auto mode = getMode();
-  bool printSep = false;
+  [[maybe_unused]] bool printSep = false;
   auto printDefaults = printDefaultValues();
   auto numAttrs = (*this)->getAttrs().size();
   printer << ' ';
@@ -1537,7 +1539,7 @@ ::mlir::LogicalResult UpdateOffsetOp::verify() {
 
 ::mlir::LogicalResult UpdateNDOffsetOp::verify() {
   // number of offsets specified must match the rank of the tensor descriptor
-  if (getTensorDesc().getType().getRank() != getOffsets().size()) {
+  if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) {
     return emitOpError("Invalid number of offsets.");
   }
   return ::mlir::success();
diff --git a/lib/Transforms/BF16ToGPU.cpp b/lib/Transforms/BF16ToGPU.cpp
index fa8bd52a5..85aadd287 100644
--- a/lib/Transforms/BF16ToGPU.cpp
+++ b/lib/Transforms/BF16ToGPU.cpp
@@ -34,140 +34,136 @@ struct BF16ToGPUPass : public BF16ToGPUBase<BF16ToGPUPass> {
     auto mod = getOperation();
     SymbolTable symbolTable(mod);
     mlir::OpBuilder builder(mod);
-    auto &aliases = getAnalysis();
     // Part 1: gpu::GPUFuncOp
-    WalkResult result1 =
-        mod.walk([&](gpu::GPUFuncOp op) -> WalkResult {
-          // 1-1: Create new FunctionType and replace old FunctionType
-          auto oftype = op.getFunctionType();
-          llvm::SmallVector<Type> argTypes;
-          ArrayRef<Type> inputTypes;
-          ArrayRef<Type> resultTypes;
-          for (Type t : oftype.getInputs()) {
-            MemRefType m = t.dyn_cast<MemRefType>();
-            if (m) {
-              Type et = m.getElementType();
-              if (et.isBF16()) {
-                if (m.hasStaticShape()) {
-                  llvm::ArrayRef<int64_t> s = m.getShape();
-                  auto i = MemRefType::get(s, builder.getI16Type());
-                  argTypes.push_back(i);
-                } else {
-                  // TODO: Support dynamic shape
-                  op.emitError(
-                      "Non static shape bf16 MemRefType in GPUFuncOp inputs");
-                }
-              } else {
-                argTypes.push_back(t);
-              }
-            } else if (t.isBF16()) {
-              argTypes.push_back(builder.getI16Type());
-            } else {
-              argTypes.push_back(t);
-            }
-          }
-          auto nftype =
-              dyn_cast<FunctionType>(op.cloneTypeWith(argTypes, resultTypes));
-          op.setFunctionType(nftype);
-
-          // 1-2: Collect ops that need bf16 widening and widen those ops
-          // Most ops in arith and math dialect that has bf16 operand will
-          // be widened to use f32 operand
-          SmallVector<Operation *> widenOps;
-          WalkResult result1_1 = op.getRegion().walk(
-              [&](Operation *lop) -> WalkResult {
-                auto oname = lop->getName().getStringRef();
-                if (oname.startswith("arith.") || oname.startswith("math.")) {
-                  // Skip bitcast operation as we cannot change width of operand
-                  if (!oname.startswith("arith.bitcast")) {
-                    bool needWidening = false;
-                    for (const auto &oper : lop->getOperands()) {
-                      if (oper.getType().isBF16()) {
-                        needWidening = true;
-                      }
-                    }
-                    if (needWidening) {
-                      widenOps.push_back(lop);
-                    }
-                  }
-                }
-                return WalkResult::advance();
-              });
-          for (Operation *o : widenOps) {
-            builder.setInsertionPoint(o);
-            unsigned int idx = 0;
-            for (const auto &oper : o->getOperands()) {
-              if (oper.getType().isBF16()) {
-                auto newOp = builder.create<arith::ExtFOp>(
-                    o->getLoc(), builder.getF32Type(), oper);
-                o->setOperand(idx, newOp);
-              }
-              idx++;
-            }
-            for (mlir::OpResult res : o->getResults()) {
-              if (res.getType().isBF16()) {
-                res.setType(builder.getF32Type());
-                builder.setInsertionPointAfter(o);
-                auto newRes = builder.create<arith::TruncFOp>(
-                    o->getLoc(), builder.getBF16Type(), res);
-                res.replaceAllUsesExcept(newRes, newRes);
-              }
-            }
-          }
-          // 1-3: Change element type of entry block arguments
-          Block &eblock = op.getBlocks().front();
-          for (mlir::BlockArgument arg : eblock.getArguments()) {
-            Type argt = arg.getType();
-            MemRefType mt = dyn_cast<MemRefType>(argt);
-            if (mt) {
-              if (mt.getElementType().isBF16()) {
-                MemRefType newMt = dyn_cast<MemRefType>(
-                    mt.cloneWith(mt.getShape(), builder.getI16Type()));
-                arg.setType(newMt);
-              }
-            } else if (argt.isBF16()) {
-              arg.setType(builder.getI16Type());
-            }
-          }
-          WalkResult result1_2 = op.getRegion().walk(
-              [&](Operation *lop) -> WalkResult {
-                if (dyn_cast<arith::ExtFOp>(lop)) {
-                  // if extf i16 -> f32 : "i16" is not a typo
-                  if (lop->getOperand(0).getType().isInteger(16)) {
-                    if (lop->getResult(0).getType().isF32()) {
-                      builder.setInsertionPoint(lop);
-                      auto bcast = builder.create<arith::BitcastOp>(
-                          lop->getLoc(), builder.getBF16Type(),
-                          lop->getOperand(0));
-                      lop->setOperand(0, bcast);
-                    }
-                  }
-                } else if (dyn_cast<arith::TruncFOp>(lop)) {
-                  // if truncf f32 -> bf16
-                  if (lop->getOperand(0).getType().isF32()) {
-                    if (lop->getResult(0).getType().isBF16()) {
-                      builder.setInsertionPointAfter(lop);
-                      auto bcast = builder.create<arith::BitcastOp>(
-                          lop->getLoc(), builder.getI16Type(),
-                          lop->getResult(0));
-                      lop->getResult(0).replaceAllUsesExcept(bcast, bcast);
-                    }
-                  }
-                } else {
-                  if (lop->getNumResults() > 0) {
-                    if (lop->getResultTypes().front().isBF16()) {
-                      lop->getResult(0).setType(builder.getI16Type());
-                    }
-                  }
-                }
-                return WalkResult::advance();
-              });
-          return WalkResult::advance();
-        });
+    (void)mod.walk([&](gpu::GPUFuncOp op) -> WalkResult {
+      // 1-1: Create new FunctionType and replace old FunctionType
+      auto oftype = op.getFunctionType();
+      llvm::SmallVector<Type> argTypes;
+      ArrayRef<Type> inputTypes;
+      ArrayRef<Type> resultTypes;
+      for (Type t : oftype.getInputs()) {
+        MemRefType m = t.dyn_cast<MemRefType>();
+        if (m) {
+          Type et = m.getElementType();
+          if (et.isBF16()) {
+            if (m.hasStaticShape()) {
+              llvm::ArrayRef<int64_t> s = m.getShape();
+              auto i = MemRefType::get(s, builder.getI16Type());
+              argTypes.push_back(i);
+            } else {
+              // TODO: Support dynamic shape
+              op.emitError(
+                  "Non static shape bf16 MemRefType in GPUFuncOp inputs");
+            }
+          } else {
+            argTypes.push_back(t);
+          }
+        } else if (t.isBF16()) {
+          argTypes.push_back(builder.getI16Type());
+        } else {
+          argTypes.push_back(t);
+        }
+      }
+      auto nftype =
+          dyn_cast<FunctionType>(op.cloneTypeWith(argTypes, resultTypes));
+      op.setFunctionType(nftype);
+
+      // 1-2: Collect ops that need bf16 widening and widen those ops
+      // Most ops in arith and math dialect that has bf16 operand will
+      // be widened to use f32 operand
+      SmallVector<Operation *> widenOps;
+      (void)op.getRegion().walk(
+          [&](Operation *lop) -> WalkResult {
+            auto oname = lop->getName().getStringRef();
+            if (oname.startswith("arith.") || oname.startswith("math.")) {
+              // Skip bitcast operation as we cannot change width of operand
+              if (!oname.startswith("arith.bitcast")) {
+                bool needWidening = false;
+                for (const auto &oper : lop->getOperands()) {
+                  if (oper.getType().isBF16()) {
+                    needWidening = true;
+                  }
+                }
+                if (needWidening) {
+                  widenOps.push_back(lop);
+                }
+              }
+            }
+            return WalkResult::advance();
+          });
+      for (Operation *o : widenOps) {
+        builder.setInsertionPoint(o);
+        unsigned int idx = 0;
+        for (const auto &oper : o->getOperands()) {
+          if (oper.getType().isBF16()) {
+            auto newOp = builder.create<arith::ExtFOp>(
+                o->getLoc(), builder.getF32Type(), oper);
+            o->setOperand(idx, newOp);
+          }
+          idx++;
+        }
+        for (mlir::OpResult res : o->getResults()) {
+          if (res.getType().isBF16()) {
+            res.setType(builder.getF32Type());
+            builder.setInsertionPointAfter(o);
+            auto newRes = builder.create<arith::TruncFOp>(
+                o->getLoc(), builder.getBF16Type(), res);
+            res.replaceAllUsesExcept(newRes, newRes);
+          }
+        }
+      }
+      // 1-3: Change element type of entry block arguments
+      Block &eblock = op.getBlocks().front();
+      for (mlir::BlockArgument arg : eblock.getArguments()) {
+        Type argt = arg.getType();
+        MemRefType mt = dyn_cast<MemRefType>(argt);
+        if (mt) {
+          if (mt.getElementType().isBF16()) {
+            MemRefType newMt = dyn_cast<MemRefType>(
+                mt.cloneWith(mt.getShape(), builder.getI16Type()));
+            arg.setType(newMt);
+          }
+        } else if (argt.isBF16()) {
+          arg.setType(builder.getI16Type());
+        }
+      }
+      (void)op.getRegion().walk(
+          [&](Operation *lop) -> WalkResult {
+            if (dyn_cast<arith::ExtFOp>(lop)) {
+              // if extf i16 -> f32 : "i16" is not a typo
+              if (lop->getOperand(0).getType().isInteger(16)) {
+                if (lop->getResult(0).getType().isF32()) {
+                  builder.setInsertionPoint(lop);
+                  auto bcast = builder.create<arith::BitcastOp>(
+                      lop->getLoc(), builder.getBF16Type(), lop->getOperand(0));
+                  lop->setOperand(0, bcast);
+                }
+              }
+            } else if (dyn_cast<arith::TruncFOp>(lop)) {
+              // if truncf f32 -> bf16
+              if (lop->getOperand(0).getType().isF32()) {
+                if (lop->getResult(0).getType().isBF16()) {
+                  builder.setInsertionPointAfter(lop);
+                  auto bcast = builder.create<arith::BitcastOp>(
+                      lop->getLoc(), builder.getI16Type(), lop->getResult(0));
+                  lop->getResult(0).replaceAllUsesExcept(bcast, bcast);
+                }
+              }
+            } else {
+              if (lop->getNumResults() > 0) {
+                if (lop->getResultTypes().front().isBF16()) {
+                  lop->getResult(0).setType(builder.getI16Type());
+                }
+              }
+            }
+            return WalkResult::advance();
+          });
+      return WalkResult::advance();
+    });
     // Part 2: gpu::LaunchFuncOp and gpu::AllocOp
     SmallVector<Operation *> replacedAllocOps;
-    WalkResult result2 = mod.walk([&](gpu::LaunchFuncOp op)
-                                      -> WalkResult {
+    (void)mod.walk([&](gpu::LaunchFuncOp op)
+                       -> WalkResult {
       for (const auto &kop : op.getKernelOperands()) {
         auto mem = kop;
         Type memt = mem.getType();
diff --git a/lib/Transforms/LowerMemRefCopy.cpp b/lib/Transforms/LowerMemRefCopy.cpp
index a590ba5ea..ebb9622b7 100644
--- a/lib/Transforms/LowerMemRefCopy.cpp
+++ b/lib/Transforms/LowerMemRefCopy.cpp
@@ -38,46 +38,45 @@ struct LowerMemRefCopy
     auto &domInfo = getAnalysis<DominanceInfo>();
     auto func = getOperation();
     // walk through memref.copy ops in the funcion body
-    WalkResult result =
-        func.walk([&](memref::CopyOp op) -> WalkResult {
-          if (op->getParentOp() != func)
-            return WalkResult::skip();
-          auto src = op.getSource();
-          auto dst = op.getTarget();
-          // supposed to work on same memref type
-          auto srcType = src.getType().cast<MemRefType>();
-          auto dstType = dst.getType().cast<MemRefType>();
-          if (srcType != dstType)
-            return WalkResult::skip();
-          // supposed to work on memref.alloc
-          auto srcOp = src.getDefiningOp<memref::AllocOp>();
-          auto dstOp = dst.getDefiningOp<memref::AllocOp>();
-          if (!srcOp || !dstOp)
-            return WalkResult::skip();
-          // check use of src after this copyOp, being conservative
-          // FIXME: handle dealloc of src and dst
-          bool hasSubsequentUse = false;
-          for (auto user : src.getUsers()) {
-            if (isa<memref::DeallocOp>(user)) {
-              continue;
-            }
-            if (domInfo.properlyDominates(op, user)) {
-              hasSubsequentUse = true;
-              break;
-            }
-          }
+    (void)func.walk([&](memref::CopyOp op) -> WalkResult {
+      if (op->getParentOp() != func)
+        return WalkResult::skip();
+      auto src = op.getSource();
+      auto dst = op.getTarget();
+      // supposed to work on same memref type
+      auto srcType = src.getType().cast<MemRefType>();
+      auto dstType = dst.getType().cast<MemRefType>();
+      if (srcType != dstType)
+        return WalkResult::skip();
+      // supposed to work on memref.alloc
+      auto srcOp = src.getDefiningOp<memref::AllocOp>();
+      auto dstOp = dst.getDefiningOp<memref::AllocOp>();
+      if (!srcOp || !dstOp)
+        return WalkResult::skip();
+      // check use of src after this copyOp, being conservative
+      // FIXME: handle dealloc of src and dst
+      bool hasSubsequentUse = false;
+      for (auto user : src.getUsers()) {
+        if (isa<memref::DeallocOp>(user)) {
+          continue;
+        }
+        if (domInfo.properlyDominates(op, user)) {
+          hasSubsequentUse = true;
+          break;
+        }
+      }
 
-          // replace copy with linalg.generic
-          if (hasSubsequentUse) {
-            OpBuilder builder(op);
-            linalg::makeMemRefCopyOp(builder, op.getLoc(), src, dst);
-          } else {
-            // coalesce buffer
-            dst.replaceAllUsesWith(src);
-          }
-          op.erase();
-          return WalkResult::advance();
-        });
+      // replace copy with linalg.generic
+      if (hasSubsequentUse) {
+        OpBuilder builder(op);
+        linalg::makeMemRefCopyOp(builder, op.getLoc(), src, dst);
+      } else {
+        // coalesce buffer
+        dst.replaceAllUsesWith(src);
+      }
+      op.erase();
+      return WalkResult::advance();
+    });
   }
 };
 } // namespace
diff --git a/lib/Utils/XeCommon.cpp b/lib/Utils/XeCommon.cpp
index c86be8a78..ee32d7a3e 100644
--- a/lib/Utils/XeCommon.cpp
+++ b/lib/Utils/XeCommon.cpp
@@ -82,7 +82,7 @@ static mlir::BlockArgument getArgForOperand(mlir::scf::ForOp &forOp,
                                             mlir::Value operand) {
   auto idx = getOperandIndex(forOp, operand);
   auto numControls = forOp.getNumControlOperands();
-  assert(idx >= numControls);
+  assert(idx >= (int)numControls);
   return forOp.getRegionIterArg(idx - numControls);
 };
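
Note: the whole patch reduces to three warning-fix idioms: cast size() to the signed type MLIR uses for shapes and ranks before comparing (-Wsign-compare), mark static helpers that not every translation unit references [[maybe_unused]] (-Wunused-function), and discard intentionally ignored results with a (void) cast instead of binding them to a never-read variable such as the old WalkResult locals. A minimal self-contained sketch of the three idioms outside any MLIR context follows; all names in it are illustrative, not from the IMEX sources.

#include <cassert>
#include <cstdint>
#include <vector>

// Idiom 2: a static helper defined in a header is not referenced by every
// translation unit that includes it; [[maybe_unused]] silences the warning
// without deleting the helper (hypothetical helper, for illustration only).
[[maybe_unused]] static const char *passLabel() { return "demo-pass"; }

// Idiom 1: size() returns an unsigned std::size_t, while shape dimensions
// are int64_t; cast the unsigned side so the comparison has one signedness.
static bool rankMatches(const std::vector<int64_t> &shape, int64_t rank) {
  return (int64_t)shape.size() == rank;
}

int main() {
  std::vector<int64_t> shape = {8, 16};
  assert(rankMatches(shape, 2));
  // Idiom 3: the result is intentionally ignored, so discard it explicitly
  // rather than binding it to a variable that is never read afterwards.
  (void)passLabel();
  return 0;
}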