diff --git a/include/imex/Conversion/CMakeLists.txt b/include/imex/Conversion/CMakeLists.txt
index db25c8fc0..635dcd12f 100644
--- a/include/imex/Conversion/CMakeLists.txt
+++ b/include/imex/Conversion/CMakeLists.txt
@@ -9,3 +9,4 @@ add_subdirectory(DistToStandard)
 add_subdirectory(DropRegions)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
+add_subdirectory(MathToVC)
diff --git a/include/imex/Conversion/MathToVC/CMakeLists.txt b/include/imex/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/include/imex/Conversion/MathToVC/MathToVC.h b/include/imex/Conversion/MathToVC/MathToVC.h
new file mode 100644
index 000000000..7128cd874
--- /dev/null
+++ b/include/imex/Conversion/MathToVC/MathToVC.h
@@ -0,0 +1,51 @@
+//===- MathToVC.h - MathToVC conversion -------------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the conversion of select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions.
+///
+//===----------------------------------------------------------------------===//
+#ifndef IMEX_CONVERSION_MATHTOVC_H
+#define IMEX_CONVERSION_MATHTOVC_H
+
+#include <memory>
+#include <mlir/Pass/Pass.h>
+
+#include "imex/Utils/XeCommon.h"
+
+namespace mlir {
+
+class ConversionTarget;
+class LLVMTypeConverter;
+class Pass;
+class Operation;
+class RewritePatternSet;
+template <typename OpT> class OperationPass;
+
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
+} // namespace mlir
+
+namespace imex {
+#define GEN_PASS_DECL_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+
+void populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation = false);
+void configureMathToVCConversionLegality(::mlir::ConversionTarget &target);
+std::unique_ptr<::mlir::OperationPass<::mlir::gpu::GPUModuleOp>>
+createConvertMathToVCPass();
+
+} // namespace imex
+#endif // IMEX_CONVERSION_MATHTOVC_H
diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h
index d466ba422..0cb9c8c81 100644
--- a/include/imex/Conversion/Passes.h
+++ b/include/imex/Conversion/Passes.h
@@ -22,6 +22,7 @@
 #include <imex/Conversion/GPUToGPUX/GPUToGPUX.h>
 #include <imex/Conversion/GPUToSPIRV/GPUToSPIRV.h>
 #include <imex/Conversion/GPUXToLLVM/GPUXToLLVM.h>
+#include <imex/Conversion/MathToVC/MathToVC.h>
 #include <imex/Conversion/NDArrayToLinalg/NDArrayToLinalg.h>
 #include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
 #include <imex/Conversion/XeTileToXeGPU/XeTileToXeGPU.h>
diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 40b089050..1ad939d4a 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -427,4 +427,34 @@ def ConvertXeGPUToVC : Pass<"convert-xegpu-to-vc", "::mlir::gpu::GPUModuleOp"> {
   let constructor = "imex::createConvertXeGPUToVCPass()";
 }
 
+
+//===----------------------------------------------------------------------===//
+// MathToVC
+//===----------------------------------------------------------------------===//
+def ConvertMathToVC : Pass<"convert-math-to-vc", "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Generate vc-intrinsics functions for select math dialect operations";
+  let description = [{
+    Convert select math dialect operations into Func dialect calls to
+    vc-intrinsics functions. Some math operations are not supported by the
+    VC compiler (the IGC vector backend), so this pass rewrites them as
+    vc-intrinsic calls.
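+
+    For example, `math.ceil` on a 1-D f16 vector is lowered roughly as
+    follows (illustrative IR; the operand is widened to f32 and the result
+    truncated back):
+
+    ```mlir
+    %1 = arith.extf %0 : vector<16xf16> to vector<16xf32>
+    %2 = func.call @llvm.genx.rndu.v16f32(%1) : (vector<16xf32>) -> vector<16xf32>
+    %3 = arith.truncf %2 : vector<16xf32> to vector<16xf16>
+    ```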
+  }];
+  let options = [
+    Option<"enableHighPrecisionInterimCalculation", "enable-high-precision-interim-calculation", "bool",
+           /*default=*/"false",
+           "Enables high-precision (f32) interim calculation for math operations: "
+           "any interim instruction added as part of the conversion will be high precision (f32).">
+  ];
+
+  let dependentDialects = ["::mlir::math::MathDialect",
+                           "::mlir::vector::VectorDialect",
+                           "::mlir::LLVM::LLVMDialect",
+                           "::mlir::func::FuncDialect",
+                           "::mlir::arith::ArithDialect"
+                          ];
+  let constructor = "imex::createConvertMathToVCPass()";
+}
+
 #endif // _IMEX_CONVERSION_PASSES_TD_INCLUDED_
diff --git a/lib/Conversion/XeGPUToVC/Utils.h b/include/imex/Utils/VCUtils.h
similarity index 89%
rename from lib/Conversion/XeGPUToVC/Utils.h
rename to include/imex/Utils/VCUtils.h
index 0f9c06fb9..20ac41811 100644
--- a/lib/Conversion/XeGPUToVC/Utils.h
+++ b/include/imex/Utils/VCUtils.h
@@ -8,14 +8,20 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines some utils used in XeGPUToVC pass
+/// This file defines some utils used in the VC conversion passes
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef XEGPU_VC_UTILS_H
-#define XEGPU_VC_UTILS_H
+#ifndef VC_UTILS_H
+#define VC_UTILS_H
 
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Transforms/DialectConversion.h"
 
 using namespace mlir;
diff --git a/include/imex/Utils/XeCommon.h b/include/imex/Utils/XeCommon.h
index d14603e10..f2d08884a 100644
--- a/include/imex/Utils/XeCommon.h
+++ b/include/imex/Utils/XeCommon.h
@@ -617,6 +617,12 @@ llvm::SmallVector<int64_t> swapLastTwoElements(llvm::ArrayRef<int64_t> shape) {
 /// output strides = 60x20x5x1
 llvm::SmallVector<int64_t> defaultStrides(llvm::ArrayRef<int64_t> shape);
 
+/// Checks if the given `type` is a 1-D vector type that requires the
+/// VectorAnyINTEL capability. In other words, the vector size is not
+/// supported by SPIR-V, which only supports vectors of 2, 3, 4, 8, or 16
+/// elements (8 and 16 require the Vector16 capability).
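+/// For example, vector<16xf32> is a SPIR-V-supported size, while a size such
+/// as vector<5xf32> is not and therefore requires VectorAnyINTEL.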
+bool isVectorAnyINTELType(mlir::Type type);
+
 } // namespace imex
 
 #endif
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 813781dd3..b52337e57 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -4,5 +4,6 @@ add_subdirectory(DropRegions)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToGPUX)
 add_subdirectory(GPUXToLLVM)
+add_subdirectory(MathToVC)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
diff --git a/lib/Conversion/MathToVC/CMakeLists.txt b/lib/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..ab5d76327
--- /dev/null
+++ b/lib/Conversion/MathToVC/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_imex_conversion_library(IMEXMathToVC
+  MathToVC.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/MathToVC
+
+  DEPENDS
+  IMEXConversionPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRSupport
+  MLIRLLVMCommonConversion
+  MLIRGPUDialect
+  MLIRPass
+)
diff --git a/lib/Conversion/MathToVC/MathToVC.cpp b/lib/Conversion/MathToVC/MathToVC.cpp
new file mode 100644
index 000000000..0e0121c91
--- /dev/null
+++ b/lib/Conversion/MathToVC/MathToVC.cpp
@@ -0,0 +1,318 @@
+//===- MathToVC.cpp - MathToVC conversion -----------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the conversion of select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
+#include "imex/Utils/XeCommon.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace imex {
+#define GEN_PASS_DEF_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+} // namespace imex
+
+using namespace mlir;
+using namespace imex;
+
+namespace {
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+// Get the VC intrinsic name for the given math operation.
+template <typename MOp> std::string getVCIntrinsicName() {
+  constexpr bool isCeilOp = std::is_same_v<MOp, math::CeilOp>;
+  constexpr bool isFloorOp = std::is_same_v<MOp, math::FloorOp>;
+  constexpr bool isExpOp = std::is_same_v<MOp, math::ExpOp>;
+  constexpr bool isExp2Op = std::is_same_v<MOp, math::Exp2Op>;
+  if (isCeilOp)
+    return "llvm.genx.rndu.";
+  else if (isFloorOp)
+    return "llvm.genx.rndd.";
+  else if (isExpOp || isExp2Op)
+    return "llvm.genx.exp.";
+  else
+    assert(0 && "Unsupported math Op. Add more support!");
+}
+
+// Utility function to convert a scalar or vector type of any float bitwidth
+// to another float element type, preserving the shape.
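+// E.g., (vector<16xf16>, f32) -> vector<16xf32>, and (f64, f32) -> f32.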
+Type convertScalarOrVectorFloatType(Type srcType, Type dstElementType) {
+  // Get a vector type or scalar type of dstElementType with the same shape
+  // as srcType.
+  if (auto vecTy = dyn_cast<VectorType>(srcType)) {
+    auto newTy = VectorType::get(vecTy.getShape(), dstElementType);
+    return newTy;
+  } else if (auto scalarTy = dyn_cast<FloatType>(srcType)) {
+    return dstElementType;
+  } else {
+    assert(0 && "Unsupported type");
+  }
+}
+
+// Utility function to convert a range of float args to a specific float type.
+// The function converts the float args to dstElementType and generates an
+// extension or truncation op if the bitwidths of the src and dst types differ.
+SmallVector<Value> convertFloatArgsType(SmallVector<Value> args,
+                                        Type dstElementType,
+                                        ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> newArgs;
+  auto dstBitWidth = dstElementType.getIntOrFloatBitWidth();
+  for (auto arg : args) {
+    // Assert if src and dst types are not float types.
+    assert(((isa<FloatType>(arg.getType()) ||
+             isa<FloatType>(
+                 dyn_cast<VectorType>(arg.getType()).getElementType())) &&
+            isa<FloatType>(dstElementType)) &&
+           "Unsupported type, src and dst both should be float types");
+    auto srcBitWidth =
+        dyn_cast<VectorType>(arg.getType())
+            ? dyn_cast<VectorType>(arg.getType()).getElementTypeBitWidth()
+            : arg.getType().getIntOrFloatBitWidth();
+
+    if (srcBitWidth == dstBitWidth)
+      newArgs.push_back(arg);
+    else if (srcBitWidth < dstBitWidth) {
+      auto newType =
+          convertScalarOrVectorFloatType(arg.getType(), dstElementType);
+      auto newOp = rewriter.create<arith::ExtFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    } else if (srcBitWidth > dstBitWidth) {
+      auto newType =
+          convertScalarOrVectorFloatType(arg.getType(), dstElementType);
+      auto newOp = rewriter.create<arith::TruncFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    }
+  }
+  return newArgs;
+}
+//===----------------------------------------------------------------------===//
+// Operation conversion
+//===----------------------------------------------------------------------===//
+
+// Elementwise math-to-vc-intrinsics conversion pattern for ops that the VC
+// backend only supports in f32.
+template <typename MOp>
+struct ElementwiseFloatOnlyMathOpPattern final
+    : public OpConversionPattern<MOp> {
+  using OpConversionPattern<MOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(MOp op, typename MOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type opElementType;
+    // Check if the result type is a 1-D vector; bail on higher ranks.
+    if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
+      if (vecTy.getRank() != 1)
+        return failure();
+      opElementType = vecTy.getElementType();
+    } else {
+      opElementType = op.getType();
+    }
+    auto loc = op.getLoc();
+    auto args = adaptor.getOperands();
+    // Upconvert or downconvert all operand element types to f32. Warn on
+    // truncation, since a truncated input can produce a result that differs
+    // from the original value.
+    if (opElementType.getIntOrFloatBitWidth() > 32)
+      emitWarning(op.getLoc(), "Truncation is done on input during conversion, "
+                               "may result in wrong result.\n");
+    llvm::SmallVector<Value> newArgs =
+        convertFloatArgsType(args, rewriter.getF32Type(), rewriter);
+
+    // The result element type is always f32.
+    auto newType =
+        convertScalarOrVectorFloatType(op.getType(), rewriter.getF32Type());
+    std::string resStr = "f32";
+    resStr.insert(
+        0, ((dyn_cast<VectorType>(newType))
+                ? llvm::formatv("v{0}",
+                                dyn_cast<VectorType>(newType).getNumElements())
+                      .str()
+                : ""));
+
+    // Generate a call to the corresponding VC intrinsic, suffixed with the
+    // encoded result type.
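+    // E.g., math.ceil on vector<16xf32> produces a call to
+    // "llvm.genx.rndu.v16f32".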
+    auto funcName = getVCIntrinsicName<MOp>();
+    funcName += resStr;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {newType}, newArgs, false);
+
+    // Initialize a SmallVector with the call result.
+    SmallVector<Value> callOpResult;
+    callOpResult.push_back(callOp.getResult(0));
+
+    // Convert the result of the call back to the original element type.
+    auto originalResultType =
+        convertFloatArgsType(callOpResult, opElementType, rewriter);
+
+    rewriter.replaceOp(op, originalResultType);
+    return success();
+  }
+};
+
+// ExpOp conversion pattern; supports both math::ExpOp and math::Exp2Op.
+template <typename ExpOp>
+struct ExpOpPattern final : public OpConversionPattern<ExpOp> {
+  ExpOpPattern(MLIRContext *ctx, bool enableHighPrecisionInterimCalculation)
+      : OpConversionPattern<ExpOp>(ctx),
+        enableHighPrecisionInterimCalculation(
+            enableHighPrecisionInterimCalculation) {}
+  LogicalResult
+  matchAndRewrite(ExpOp op, typename ExpOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vecTy = dyn_cast<VectorType>(op.getType());
+
+    // Only deal with exp ops of 1-D vector type.
+    if (!(vecTy && vecTy.getRank() == 1))
+      return failure();
+
+    auto loc = op.getLoc();
+
+    // "llvm.genx.exp" returns the base-2 exponentiation of the input. To get
+    // the base-e exponentiation, the input has to be scaled by log2(e).
+    bool isExpOp = std::is_same_v<ExpOp, math::ExpOp>;
+    auto operands = adaptor.getOperands();
+    SmallVector<Value> args{operands};
+    // Scale the input by a constant vector holding log2(e) = 1.442695040888963.
+    if (isExpOp) {
+      // Create the interim instructions in an f32 vector type if the element
+      // type is narrower than 32 bits and high precision is requested.
+      if (this->enableHighPrecisionInterimCalculation &&
+          vecTy.getElementType().getIntOrFloatBitWidth() < 32) {
+        auto interimVectorType =
+            VectorType::get(vecTy.getShape(), rewriter.getF32Type());
+        auto vecAttr = DenseElementsAttr::get(
+            interimVectorType,
+            rewriter.getFloatAttr(interimVectorType.getElementType(),
+                                  1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, interimVectorType, vecAttr);
+        auto input = convertFloatArgsType({operands[0]}, rewriter.getF32Type(),
+                                          rewriter);
+        auto scaledInputf32 = rewriter.create<arith::MulFOp>(
+            op.getLoc(), input[0], log2eConstVec);
+        auto scaledInput = convertFloatArgsType(
+            {scaledInputf32}, vecTy.getElementType(), rewriter);
+        args.clear();
+        args.push_back(scaledInput[0]);
+      } else {
+        auto vecAttr = DenseElementsAttr::get(
+            vecTy,
+            rewriter.getFloatAttr(vecTy.getElementType(), 1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, vecTy, vecAttr);
+        auto input = operands[0];
+        auto scaledInput =
+            rewriter.create<arith::MulFOp>(op.getLoc(), input, log2eConstVec);
+        args.clear();
+        args.push_back(scaledInput);
+      }
+    }
+    // Generate a call to the corresponding VC intrinsic, suffixed with the
+    // encoded vector type.
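+    // E.g., for vector<16xf16> encodeVectorType yields "v8i32" (16 f16
+    // elements are encoded as 8 i32s), giving "llvm.genx.exp.v8i32".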
+    auto funcName = getVCIntrinsicName<ExpOp>();
+    funcName += encodeVectorType(rewriter, vecTy).first;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {op.getType()}, args, false);
+    rewriter.replaceOp(op, callOp);
+    return success();
+  }
+
+private:
+  const bool enableHighPrecisionInterimCalculation;
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pattern Population
+//===----------------------------------------------------------------------===//
+
+void imex::populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation) {
+  // Add patterns
+  patterns.add<ElementwiseFloatOnlyMathOpPattern<math::CeilOp>,
+               ElementwiseFloatOnlyMathOpPattern<math::FloorOp>>(
+      patterns.getContext());
+  patterns.add<ExpOpPattern<math::ExpOp>, ExpOpPattern<math::Exp2Op>>(
+      patterns.getContext(), enableHighPrecisionInterimCalculation);
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion Legality configuration
+//===----------------------------------------------------------------------===//
+void imex::configureMathToVCConversionLegality(
+    ::mlir::ConversionTarget &target) {
+  // Add legal dialects
+  target.addLegalDialect<::mlir::func::FuncDialect,
+                         ::mlir::arith::ArithDialect,
+                         ::mlir::vector::VectorDialect,
+                         ::mlir::LLVM::LLVMDialect>();
+  // math.exp and math.exp2 are only converted if they produce 1-D vectors.
+  target.addDynamicallyLegalOp<::mlir::math::ExpOp, ::mlir::math::Exp2Op>(
+      [&](Operation *op) {
+        if (auto vecTy = dyn_cast<VectorType>(op->getResult(0).getType())) {
+          if (vecTy.getRank() != 1)
+            return true;
+          return false;
+        }
+        return true;
+      });
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Definition
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct MathToVCPass : public imex::impl::ConvertMathToVCBase<MathToVCPass> {
+  using Base::Base;
+  MathToVCPass(bool enableHighPrecisionInterimCalculation)
+      : imex::impl::ConvertMathToVCBase<MathToVCPass>() {
+    this->enableHighPrecisionInterimCalculation.setValue(
+        enableHighPrecisionInterimCalculation);
+  }
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
+    LLVMTypeConverter typeConverter(&getContext());
+    ConversionTarget target(getContext());
+    RewritePatternSet patterns(&getContext());
+
+    // Add patterns and configure legality.
+    imex::populateMathToVCPatterns(
+        typeConverter, patterns,
+        this->enableHighPrecisionInterimCalculation.getValue());
+    configureMathToVCConversionLegality(target);
+
+    if (failed(applyPartialConversion(m, target, std::move(patterns))))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<::mlir::OperationPass<::mlir::gpu::GPUModuleOp>>
+imex::createConvertMathToVCPass() {
+  return std::make_unique<MathToVCPass>();
+}
diff --git a/lib/Conversion/XeGPUToVC/CMakeLists.txt b/lib/Conversion/XeGPUToVC/CMakeLists.txt
index f3f0d6699..262a04ec9 100644
--- a/lib/Conversion/XeGPUToVC/CMakeLists.txt
+++ b/lib/Conversion/XeGPUToVC/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_imex_conversion_library(IMEXXeGPUToVC
   LSCPatterns.cpp
   XeGPUToVC.cpp
-  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/XeGPUToVC
diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
index c6dfec61b..4c81c523b 100644
--- a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
+++ b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
@@ -38,7 +38,7 @@
 #include "llvm/Support/FormatVariadic.h"
 
 #include "LscIntrinsicEnums.h"
-#include "Utils.h"
+#include "imex/Utils/VCUtils.h"
 
 using namespace mlir;
 using mlir::xegpu::AtomicRMWOp;
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index ed9369a4c..19ea6d699 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -34,13 +34,13 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 
-#include "Utils.h"
-
 namespace imex {
 #define GEN_PASS_DEF_CONVERTXEGPUTOVC
 #include "imex/Conversion/Passes.h.inc"
@@ -889,15 +889,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
       return true;
     });
 
-    target.addDynamicallyLegalOp<math::ExpOp>([&](math::ExpOp op) {
-      if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
-        if (vecTy.getRank() != 1)
-          return true;
-        return false;
-      }
-      return true;
-    });
-
     target.addIllegalOp();
 
     // TODO: can we change it to addDynamicLegalOp?
@@ -962,8 +953,7 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     // Ops to llvm.genx only Patterns
     patterns.add<VectorShapeCastPattern, SCFYieldOpPattern,
-                 ElementwiseToVCPattern<math::ExpOp>, DpasPattern,
+                 ElementwiseToVCPattern<arith::MaximumFOp>, DpasPattern,
                  NbarrierArrivePattern>(patterns.getContext());
-
     // Ops to LSC only patterns
@@ -971,6 +961,10 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
     populateLoadStoreLSCPatterns(typeConverter, patterns);
 
+    populateMathToVCPatterns(typeConverter, patterns);
+
+    configureMathToVCConversionLegality(target);
+
     if (failed(applyPartialConversion(m, target, std::move(patterns))))
       return signalPassFailure();
   }
diff --git a/lib/Utils/CMakeLists.txt b/lib/Utils/CMakeLists.txt
index 7b6ad3baa..75dcf17cf 100644
--- a/lib/Utils/CMakeLists.txt
+++ b/lib/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_library(IMEXUtil
   FuncUtils.cpp
   PassUtils.cpp
   TypeConversion.cpp
+  VCUtils.cpp
  XeCommon.cpp
   XeArch.cpp
diff --git a/lib/Conversion/XeGPUToVC/Utils.cpp b/lib/Utils/VCUtils.cpp
similarity index 95%
rename from lib/Conversion/XeGPUToVC/Utils.cpp
rename to lib/Utils/VCUtils.cpp
index 7efc9efd0..f2c4f2c2f 100644
--- a/lib/Conversion/XeGPUToVC/Utils.cpp
+++ b/lib/Utils/VCUtils.cpp
@@ -8,10 +8,11 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file implements some utils used in XeGPUToVC pass
+/// This file implements some utils used in the VC conversion passes
 ///
 //===----------------------------------------------------------------------===//
 
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -28,7 +29,7 @@ FlatSymbolRefAttr getFuncRefAttr(gpu::GPUModuleOp module, StringRef name,
                                  TypeRange resultType, ValueRange operands,
                                  bool isVectorComputeFunction,
                                  bool emitCInterface,
-                                 bool emitSPIRVLinkage = true) {
+                                 bool emitSPIRVLinkage /* = true*/) {
   MLIRContext *context = module.getContext();
   auto result = SymbolRefAttr::get(context, name);
diff --git a/lib/Utils/XeCommon.cpp b/lib/Utils/XeCommon.cpp
index e4a71c197..c701d9155 100644
--- a/lib/Utils/XeCommon.cpp
+++ b/lib/Utils/XeCommon.cpp
@@ -12,10 +12,10 @@
 /// routines used by Xe related dialects.
 ///
 //===----------------------------------------------------------------------===//
-
 #include
 #include
 #include
+#include <unordered_set>
 
 #include "imex/Dialect/XeTile/IR/XeTileOps.h"
 #include "imex/Utils/DebugUtils.h"
@@ -211,4 +211,16 @@ mlir::TypedValue<mlir::VectorType> stack(mlir::Value vecUp, mlir::Value vecDown,
   return op;
 }
 
+/// Checks if the given `type` is a 1-D vector type that requires the
+/// VectorAnyINTEL capability.
+/// In other words, the vector size is not supported by SPIR-V, which only
+/// supports vectors of 2, 3, 4, 8, or 16 elements (8 and 16 require the
+/// Vector16 capability).
+bool isVectorAnyINTELType(mlir::Type type) {
+  std::unordered_set<int64_t> spirvSupportedSizes = {2, 3, 4, 8, 16};
+  auto vecType = mlir::dyn_cast<mlir::VectorType>(type);
+  return vecType && vecType.getRank() == 1 &&
+         (spirvSupportedSizes.find(vecType.getNumElements()) ==
+          spirvSupportedSizes.end());
+}
+
 } // namespace imex
diff --git a/test/Conversion/MathToVC/math-to-vc.mlir b/test/Conversion/MathToVC/math-to-vc.mlir
new file mode 100644
index 000000000..f0fb831dd
--- /dev/null
+++ b/test/Conversion/MathToVC/math-to-vc.mlir
@@ -0,0 +1,78 @@
+// RUN: imex-opt -convert-math-to-vc -verify-diagnostics %s | FileCheck %s --check-prefixes=CHECK
+// RUN: imex-opt -convert-math-to-vc="enable-high-precision-interim-calculation=true" -verify-diagnostics %s | FileCheck %s --check-prefixes=HIGH_PRECISION
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @math_to_vc {
+    // CHECK-LABEL: gpu.func @ceil_f16
+    gpu.func @ceil_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[CEILF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.ceil %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @ceil_f64
+    gpu.func @ceil_f64(%arg0: memref<8x16xf64>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf64>, vector<16xf64>
+      // CHECK: %[[TRUNCF_F32:.*]] = arith.truncf {{.*}} : vector<16xf64> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[TRUNCF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[EXTF_F64:.*]] = arith.extf %[[CEILF]] : vector<16xf32> to vector<16xf64>
+      // expected-warning@+1 {{Truncation is done on input during conversion, may result in wrong result.}}
+      %2 = math.ceil %0 : vector<16xf64>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @floor_f16
+    gpu.func @floor_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[FLOORF:.*]] = func.call @llvm.genx.rndd.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[FLOORF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.floor %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp_f16
+    // HIGH_PRECISION-LABEL: gpu.func @exp_f16
+    gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      // HIGH_PRECISION: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
+      // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      // HIGH_PRECISION: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
+      // HIGH_PRECISION: %[[VEC_F32:.*]] = arith.extf %[[VEC]] : vector<16xf16> to vector<16xf32>
+      // HIGH_PRECISION: %[[MULF_F32:.*]] = arith.mulf %[[VEC_F32]], %[[LOG2E_VEC]] : vector<16xf32>
+      // HIGH_PRECISION: %[[MULF:.*]] = arith.truncf %[[MULF_F32]] : vector<16xf32> to vector<16xf16>
+      // HIGH_PRECISION: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      %2 = math.exp %v1 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp2_f16
+    gpu.func @exp2_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      // CHECK: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[VEC]])
+      %2 = math.exp2 %v1 : vector<16xf16>
+      gpu.return
+    }
+  }
+}
+
+// -----
diff --git a/test/Conversion/XeGPUToVC/eltwise.mlir b/test/Conversion/XeGPUToVC/eltwise.mlir
index f177e0f2d..24d59e3ac 100644
--- a/test/Conversion/XeGPUToVC/eltwise.mlir
+++ b/test/Conversion/XeGPUToVC/eltwise.mlir
@@ -7,8 +7,7 @@ module @gemm attributes {gpu.container_module} {
     %c0 = arith.constant 0 : index
     %cv1 = arith.constant dense<1.0> : vector<16xf32>
     %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<16xf32>
-    // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f32
-    // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f32 to vector<16xf32>
+    // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
     // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
     // CHECK-NEXT: func.call @llvm.genx.exp.v16f32(%[[MULF]])
     %1 = math.exp %v1 fastmath<fast> : vector<16xf32>
@@ -31,8 +30,7 @@ module @gemm attributes {gpu.container_module} {
   gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %c0 = arith.constant 0 : index
     %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
-    // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f16
-    // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f16 to vector<16xf16>
+    // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
     // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
     // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
     %2 = math.exp %v1 : vector<16xf16>
diff --git a/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
new file mode 100644
index 000000000..c6498e8d7
--- /dev/null
+++ b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
@@ -0,0 +1,53 @@
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @eltwise_add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
+  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
+    %c20 = arith.constant 20 : index
+    %c10 = arith.constant 10 : index
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<10x20xbf16>
+    memref.copy %arg1, %memref : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_0 = gpu.alloc host_shared () : memref<10x20xbf16>
+    memref.copy %arg0, %memref_0 : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<10x20xbf16>, %memref : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
+    %alloc = memref.alloc() : memref<10x20xbf16>
+    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
+    gpu.dealloc %memref_1 : memref<10x20xbf16>
+    gpu.dealloc %memref_0 : memref<10x20xbf16>
+    gpu.dealloc %memref : memref<10x20xbf16>
+    return %alloc : memref<10x20xbf16>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %2 = math.ceil %0 : bf16
+      %3 = math.floor %1 : bf16
+      %4 = arith.addf %2, %3 : bf16
+      memref.store %4, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      gpu.return
+    }
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
+    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
+    // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
+    // CHECK-COUNT-200: 1
+    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
+    return
+  }
+  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
index 9effd7727..5ed75515b 100644
--- a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
+++ b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
@@ -4,6 +4,7 @@
 // Ready for imex runner starting from GPU dialect.
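+// Note: convert-math-to-vc runs on each gpu.module before SPIR-V conversion;
+// the high-precision option keeps interim math in f32 for narrow float types.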
 builtin.module(
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     reconcile-unrealized-casts
     bf16-to-gpu
     imex-convert-gpu-to-spirv
diff --git a/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
new file mode 100644
index 000000000..9ee678a7c
--- /dev/null
+++ b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
@@ -0,0 +1,84 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0>
+  func.func @test(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+
+    %memref = gpu.alloc host_shared () : memref<8x16xf32>
+    memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32>
+    %memref_1 = gpu.alloc host_shared () : memref<8x16xf32>
+    memref.copy %arg1, %memref_1 : memref<8x16xf32> to memref<8x16xf32>
+    %memref_2 = gpu.alloc host_shared () : memref<8x16xf32>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c8, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+    gpu.dealloc %memref : memref<8x16xf32>
+    gpu.dealloc %memref_1 : memref<8x16xf32>
+    return %memref_2 : memref<8x16xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %thread_id_x = gpu.thread_id x
+      cf.br ^bb1
+    ^bb1:
+      %0 = xegpu.create_nd_tdesc %arg1[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg0[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %4 = math.ceil %1 : vector<16xf32>
+      %5 = math.floor %3 : vector<16xf32>
+      %6 = arith.addf %4, %5 : vector<16xf32>
+      %7 = xegpu.create_nd_tdesc %arg2[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      xegpu.store_nd %6, %7 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c_gen_int = arith.constant 0 : i1
+    %cf_lower = arith.constant -0.5 : f32
+    %cf_upper = arith.constant 0.5 : f32
+
+    %A = memref.alloc() : memref<8x16xf32>
+    %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    %B = memref.alloc() : memref<8x16xf32>
+    %B_random = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%B_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    // Calculate the reference C matrix on the host.
+    %c16 = arith.constant 16 : index
+    %c8 = arith.constant 8 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ref = memref.alloc() : memref<8x16xf32>
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %a = memref.load %A[%i, %j] : memref<8x16xf32>
+        %b = memref.load %B[%i, %j] : memref<8x16xf32>
+        %a_ceiled = math.ceil %a : f32
+        %b_floored = math.floor %b : f32
+        %c = arith.addf %a_ceiled, %b_floored : f32
+        memref.store %c, %ref[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C = call @test(%A, %B) : (memref<8x16xf32>, memref<8x16xf32>) -> memref<8x16xf32>
+
+    %C_cast = memref.cast %C : memref<8x16xf32> to memref<*xf32>
+    %ref_cast = memref.cast %ref : memref<8x16xf32> to memref<*xf32>
+    // call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%ref_cast, %C_cast) : (memref<*xf32>, memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
index c3ca033ea..faa5343c5 100644
--- a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
+++ b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
@@ -7,6 +7,7 @@
                imex-xegpu-apply-vnni-transformation,
                imex-xegpu-optimize-transpose)
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     gpu.module(convert-xegpu-to-vc)
     cse
     reconcile-unrealized-casts