From c6de5d105c9482be9255857aee35c0e93edd015a Mon Sep 17 00:00:00 2001 From: David <110815347+Dewei-Wang-sh@users.noreply.github.com> Date: Sat, 11 Nov 2023 00:24:24 +0800 Subject: [PATCH] [Conversion] lowering XeGPU.ops to VC-Intrinsics (#669) load1d/store1d/barrier/gather/scatter/atomic/updateoffset are all lowered; related xegpu tests run correctly --- include/imex/Conversion/Passes.td | 4 + include/imex/Dialect/XeGPU/IR/XeGPUOps.td | 46 +- lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 5 +- lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp | 1074 ++++++++++++++--- lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 24 +- .../Conversion/XeGPUToSPIRV/atomic_basic.mlir | 30 + .../XeGPUToSPIRV/barrier_basic.mlir | 44 + .../gemm_basic.mlir | 5 + .../XeGPUToSPIRV/gemm_basic_1d.mlir | 49 + .../XeGPUToSPIRV/gemm_basic_gather.mlir | 53 + .../XeGPUToSPIRV/update_offset.mlir | 47 + .../xegpu-to-llvm.pp | 0 test/Conversion/XeGPUToVC/lit.local.cfg | 3 - test/Dialect/XeGPU/IR/atomic_rmw.mlir | 12 +- test/Dialect/XeGPU/IR/barrier_ops.mlir | 4 +- .../Dialect/XeGPU/gemm_1024x1024xf16.mlir} | 0 test/Integration/Dialect/XeGPU/lit.local.cfg | 3 + .../Dialect/XeGPU/xegpu-to-llvm.pp | 17 + 18 files changed, 1183 insertions(+), 237 deletions(-) create mode 100644 test/Conversion/XeGPUToSPIRV/atomic_basic.mlir create mode 100644 test/Conversion/XeGPUToSPIRV/barrier_basic.mlir rename test/Conversion/{XeGPUToVC => XeGPUToSPIRV}/gemm_basic.mlir (92%) create mode 100644 test/Conversion/XeGPUToSPIRV/gemm_basic_1d.mlir create mode 100644 test/Conversion/XeGPUToSPIRV/gemm_basic_gather.mlir create mode 100644 test/Conversion/XeGPUToSPIRV/update_offset.mlir rename test/Conversion/{XeGPUToVC => XeGPUToSPIRV}/xegpu-to-llvm.pp (100%) delete mode 100644 test/Conversion/XeGPUToVC/lit.local.cfg rename test/{Conversion/XeGPUToVC/gemm_1024x1024xf16.runnable.mlir => Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir} (100%) create mode 100644 test/Integration/Dialect/XeGPU/lit.local.cfg create mode 100644 
test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td index 88a0f03e3..b5d43eed8 100644 --- a/include/imex/Conversion/Passes.td +++ b/include/imex/Conversion/Passes.td @@ -250,6 +250,10 @@ memref, arith and math. }]; let constructor = "imex::createConvertGPUXToSPIRVPass()"; let dependentDialects = ["::mlir::spirv::SPIRVDialect"]; + let options = [ + Option<"enableSimtIntrinsic", "enable-simt-intrinsic","bool", "false", + "Enable XeGPU.simt Ops lowered to intel genISA simt Intrinsics"> + ]; } //===----------------------------------------------------------------------===// diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td index c5f9c7d49..3c8c9018f 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td @@ -575,38 +575,28 @@ def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { } -def XeGPU_AtomicRMWOp - : XeGPU_Op<"atomic_rmw", []> { - let summary = "performa ready-modify-write operation that is free from data races."; - - let arguments = (ins - XeGPU_AtomicRMWKindAttr:$kind, - XeGPU_Vector2DType:$value, - XeGPU_TensorDesc:$tensorDesc, - XeGPU_MaskType:$mask, - DefaultValuedAttr: $mode - ); - - let assemblyFormat = [{ - $kind $value `,` $tensorDesc `,` $mask (`{` `mode` `=` $mode^ `}`)? - attr-dict - `:` `(` qualified(type($value)) `,` qualified(type($tensorDesc)) `,` qualified(type($mask)) `)` - }]; - } +def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { + let summary = "perform ready-modify-write operation that is free from data races."; + let arguments = (ins + XeGPU_AtomicRMWKindAttr:$kind, + XeGPU_TensorDesc:$tensorDesc, + XeGPU_MaskType:$mask, + Optional:$value, + DefaultValuedAttr: $mode + ); + let results = (outs XeGPU_ValueType:$result); + let assemblyFormat = [{ + $kind $tensorDesc `,` $mask (`,` $value^)? (`{` `mode` `=` $mode^ `}`)? 
attr-dict `:` qualified(type(operands)) `->` type($result) + }]; +} -def XeGPU_AllocNbarrierOp - : XeGPU_Op<"alloc_nbarrier", []> { +def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> { let summary = "allocate a specific number of named barriers."; + let arguments = (ins I32Attr: $nbarrierCount); + let assemblyFormat = "$nbarrierCount attr-dict"; +} - let arguments = (ins - I8: $nbarrier_count - ); - - let assemblyFormat = [{ - $nbarrier_count attr-dict `:` qualified(type($nbarrier_count)) - }]; - } def XeGPU_CreateNbarrierOp : XeGPU_Op<"create_nbarrier", []> { diff --git a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index 2c839bb33..cc35fe96c 100644 --- a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -158,7 +158,8 @@ void GPUXToSPIRVPass::runOnOperation() { target->addIllegalDialect(); typeConverter.addConversion( [&](xegpu::TensorDescType type) -> ::mlir::Type { - return ::mlir::IntegerType::get(context, 64); + auto i64Type = ::mlir::IntegerType::get(context, 64); + return ::mlir::VectorType::get(2, i64Type); }); typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type { unsigned rank = type.getRank(); @@ -175,6 +176,8 @@ void GPUXToSPIRVPass::runOnOperation() { for (unsigned i = 0; i < rank; i++) { sum *= type.getShape()[i]; } + if (llvm::isa(elemType)) + elemType = ::mlir::IntegerType::get(context, 64); return ::mlir::VectorType::get(sum, elemType); } }); diff --git a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp index 6685e3019..10a508b8a 100644 --- a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp +++ b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp @@ -19,9 +19,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -33,18 +35,15 @@ using namespace imex::xegpu; using namespace mlir; namespace { -/// @brief encodeVectorType(xxx, 8x8x2xf16, true) returns 
["v64i32", 64xi32] +/// @brief encodeVectorType(xxx, 8x8x2xf16, false) returns ["v64i32", 64xi32] std::pair encodeVectorType(ConversionPatternRewriter &rewriter, VectorType type, - bool cast = true) { + bool use64bitData = false, bool enforceInteger = false) { auto elemType = type.getElementType(); auto bitWidth = elemType.getIntOrFloatBitWidth(); - auto rank = type.getRank(); - auto shape = type.getShape(); - auto size = shape[0] * shape[1]; - if (!cast && bitWidth == 16) { - assert(shape[rank - 1] == 2); - size *= 2; + int size = type.getNumElements() * bitWidth / 32; + if (use64bitData) { + size /= 2; } std::string str; switch (size) { @@ -64,16 +63,17 @@ encodeVectorType(ConversionPatternRewriter &rewriter, VectorType type, assert(0 && "add more support"); break; } - if (elemType == rewriter.getF32Type()) + if (use64bitData) { + str += "i64"; + elemType = rewriter.getI64Type(); + } else if (enforceInteger) { + str += "i32"; + elemType = rewriter.getI32Type(); + } else if (elemType == rewriter.getF32Type()) str += "f32"; else if (elemType == rewriter.getF16Type()) { - if (cast) { - assert(shape[rank - 1] == 2); - str += "i32"; - elemType = rewriter.getI32Type(); - } else { - str += "f16"; - } + str += "i32"; + elemType = rewriter.getI32Type(); } else assert(0 && "add more support"); auto newType = VectorType::get(size, elemType); @@ -95,6 +95,110 @@ unsigned encodeDataum(Type type) { } } +template unsigned encodeCacheHint(OpType op) { + auto l1hint = op.getL1Hint(); + // auto l2hint = op.getL2Hint(); + auto l3hint = op.getL3Hint(); + constexpr bool isWrite = std::is_same_v || + std::is_same_v; + unsigned cacheHint = 1; + if constexpr (!isWrite) { + auto l1CacheValue = + l1hint.has_value() ? l1hint.value() : xegpu::CacheReadHint::UNCACHED; + auto l3CacheValue = + l3hint.has_value() ? 
l3hint.value() : xegpu::CacheReadHint::UNCACHED; + if (l1CacheValue == xegpu::CacheReadHint::UNCACHED) { + if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) + cacheHint = 1; + else if (l3CacheValue == xegpu::CacheReadHint::CACHED) + cacheHint = 2; + } else if (l1CacheValue == xegpu::CacheReadHint::CACHED) { + if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) + cacheHint = 3; + else if (l3CacheValue == xegpu::CacheReadHint::CACHED) + cacheHint = 4; + } else if (l1CacheValue == xegpu::CacheReadHint::STREAMING) { + if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) + cacheHint = 5; + else if (l3CacheValue == xegpu::CacheReadHint::CACHED) + cacheHint = 6; + } else if (l1CacheValue == xegpu::CacheReadHint::READ_INVALIDATE) { + if (l3CacheValue == xegpu::CacheReadHint::CACHED) + cacheHint = 7; + } + } else { + auto l1CacheValue = + l1hint.has_value() ? l1hint.value() : xegpu::CacheWriteHint::UNCACHED; + auto l3CacheValue = + l3hint.has_value() ? l3hint.value() : xegpu::CacheWriteHint::UNCACHED; + if (l1CacheValue == xegpu::CacheWriteHint::UNCACHED) { + if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) + cacheHint = 1; + else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) + cacheHint = 2; + } else if (l1CacheValue == xegpu::CacheWriteHint::WRITE_THROUGH) { + if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) + cacheHint = 3; + else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) + cacheHint = 4; + } else if (l1CacheValue == xegpu::CacheWriteHint::STREAMING) { + if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) + cacheHint = 5; + else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) + cacheHint = 6; + } else if (l1CacheValue == xegpu::CacheWriteHint::WRITE_BACK) { + if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) + cacheHint = 7; + } + } + return cacheHint; +} + +unsigned encodeOpcode(xegpu::AtomicRMWKind kind) { + unsigned encode = 0; + switch (kind) { + case xegpu::AtomicRMWKind::addf: + encode = 19; + break; + case 
xegpu::AtomicRMWKind::addi: + encode = 12; + break; + case xegpu::AtomicRMWKind::assign: + encode = 10; + break; + case xegpu::AtomicRMWKind::maxf: + encode = 22; + break; + case xegpu::AtomicRMWKind::maxs: + encode = 15; + break; + case xegpu::AtomicRMWKind::maxu: + encode = 17; + break; + case xegpu::AtomicRMWKind::minf: + encode = 21; + break; + case xegpu::AtomicRMWKind::mins: + encode = 14; + break; + case xegpu::AtomicRMWKind::minu: + encode = 16; + break; + // case xegpu::AtomicRMWKind::mulf: + // case xegpu::AtomicRMWKind::muli: + case xegpu::AtomicRMWKind::ori: + encode = 25; + break; + case xegpu::AtomicRMWKind::andi: + encode = 24; + break; + default: + assert(0 && "to be supported"); + break; + } + return encode; +} + void lookupOrInsertIntrinsic(ConversionPatternRewriter &rewriter, Operation *op, std::string name, FunctionType funcType) { auto funcAttr = StringAttr::get(rewriter.getContext(), name); @@ -121,8 +225,106 @@ class CreateNdDescToVCPattern : public OpConversionPattern { LogicalResult matchAndRewrite(CreateNdDescOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp( - op, rewriter.getI64Type(), adaptor.getSource()); + auto loc = op.getLoc(); + auto i32Type = rewriter.getI32Type(); + auto i64Type = rewriter.getI64Type(); + // payload + auto v4i32 = VectorType::get(4, i32Type); + auto v2i64 = VectorType::get(2, i64Type); + Value payLoad = rewriter.create(loc, v2i64); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + auto base = rewriter.create(loc, i64Type, + adaptor.getSource()); + auto idx0 = createIntConstant(i32Type, 0); + payLoad = + rewriter.create(loc, payLoad, base, idx0); + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + if (rank == 2) { + payLoad = rewriter.create(loc, v4i32, payLoad); + auto createOffset = [&](unsigned idx) -> Value { 
+ Value val; + if (ShapedType::isDynamic(op.getStaticOffsets()[idx])) { + val = op.getOffsets()[idx]; + val = rewriter.create(loc, i32Type, val); + } else { + val = createIntConstant(i32Type, op.getStaticOffsets()[idx]); + } + return val; + }; + auto offsetX = createOffset(1); + auto offsetY = createOffset(0); + auto idx2 = createIntConstant(i32Type, 2); + auto idx3 = createIntConstant(i32Type, 3); + payLoad = rewriter.create(loc, payLoad, + offsetX, idx2); + payLoad = rewriter.create(loc, payLoad, + offsetY, idx3); + payLoad = rewriter.create(loc, v2i64, payLoad); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class UpdateNDOffsetToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(UpdateNDOffsetOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto desc = adaptor.getTensorDesc(); + auto i32Type = rewriter.getI32Type(); + auto v4i32 = VectorType::get(4, i32Type); + auto v2i64 = VectorType::get(2, rewriter.getI64Type()); + Value cast = rewriter.create(loc, v4i32, desc); + auto offsets = adaptor.getOffsets(); + for (auto i = 0; i < offsets.size(); i++) { + auto offset = offsets[i]; + if (auto cst = dyn_cast(offset.getDefiningOp())) + if (auto attr = dyn_cast(cst.getValue()); + attr && attr.getInt() == 0) + continue; + auto idx2 = rewriter.create( + loc, i32Type, rewriter.getIntegerAttr(i32Type, 2)); + auto idx3 = rewriter.create( + loc, i32Type, rewriter.getIntegerAttr(i32Type, 3)); + Value idx = i == 0 ? 
idx3 : idx2; + auto oldOffset = + rewriter.create(loc, cast, idx); + offset = rewriter.create(loc, i32Type, offset); + auto newOffset = + rewriter.create(loc, i32Type, oldOffset, offset); + cast = rewriter.create(loc, v4i32, cast, + newOffset, idx); + } + rewriter.replaceOpWithNewOp(op, v2i64, cast); + return success(); + } +}; + +class CreateDescToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateDescOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto i32Type = rewriter.getI32Type(); + auto i64Type = rewriter.getI64Type(); + auto v2i64 = VectorType::get(2, i64Type); + Value payLoad = rewriter.create(loc, v2i64); + auto base = rewriter.create(loc, i64Type, + adaptor.getSource()); + auto idx0 = rewriter.create( + loc, i32Type, rewriter.getIntegerAttr(i32Type, 0)); + payLoad = + rewriter.create(loc, payLoad, base, idx0); + rewriter.replaceOp(op, payLoad); return success(); } }; @@ -133,8 +335,9 @@ class LoadStorePrefetchNdToLsc : public OpConversionPattern { LogicalResult matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - assert(op.getTensorDesc().getType().getShape().size() == 2 && - "only support 2d load/store/prefetch for now"); + auto tileType = op.getTensorDesc().getType(); + int rank = tileType.getRank(); + assert(rank <= 2 && "only support 1d/2d load/store/prefetch for now"); auto loc = op.getLoc(); ::mlir::VectorType vecType; std::string funcName; @@ -142,19 +345,23 @@ class LoadStorePrefetchNdToLsc : public OpConversionPattern { constexpr bool isPrefetch = std::is_same_v; if constexpr (isLoad) { vecType = cast(op.getResult().getType()); - funcName = "llvm_genx_lsc_load2d_stateless_"; + funcName = rank == 2 ? 
"llvm_genx_lsc_load2d_stateless_" + : "llvm_genx_lsc_load_stateless_"; } else if constexpr (isPrefetch) { vecType = VectorType::get({8, 16}, rewriter.getF32Type()); - funcName = "llvm_genx_lsc_prefetch2d_stateless_i1_i64"; + funcName = rank == 2 ? "llvm_genx_lsc_prefetch2d_stateless_i1_i64" + : "llvm_genx_lsc_prefetch_stateless_"; } else { vecType = cast(op.getValue().getType()); - funcName = "llvm_genx_lsc_store2d_stateless_i1_i64_"; + funcName = rank == 2 ? "llvm_genx_lsc_store2d_stateless_i1_i64_" + : "llvm_genx_lsc_store_stateless_i1_i64_"; } auto createIntConstant = [&](Type type, unsigned value) { auto attr = rewriter.getIntegerAttr(type, value); return rewriter.create(loc, type, attr); }; auto i8Type = rewriter.getI8Type(); + auto i16Type = rewriter.getI16Type(); auto i32Type = rewriter.getI32Type(); auto vnni = false; auto transpose = false; @@ -176,52 +383,87 @@ class LoadStorePrefetchNdToLsc : public OpConversionPattern { createIntConstant(i8Type, l1hint.has_value() ? (int)l1hint.value() : 0); auto l3CacheHint = createIntConstant(i8Type, l3hint.has_value() ? (int)l3hint.value() : 0); - unsigned cst = encodeDataum(vecType.getElementType()); - auto dataum = createIntConstant(i8Type, cst); + unsigned dataSize = encodeDataum(vecType.getElementType()); + auto dataum = createIntConstant(i8Type, dataSize); auto trans = createIntConstant(i8Type, transpose ? 2 : 1); // number of blocks(1 for now) auto nBlks = createIntConstant(i8Type, 1); - auto tensorType = op.getTensorDesc().getType(); - auto blockWidth = tensorType.getShape()[1]; - auto blockHeight = tensorType.getShape()[0]; - auto blockW = createIntConstant(i32Type, blockWidth); - auto blockH = createIntConstant(i32Type, blockHeight); - auto transform = createIntConstant(i8Type, vnni ? 
1 : 0); - auto base = adaptor.getTensorDesc(); - // static memref for now - auto createDescOp = - op.getTensorDesc().template getDefiningOp(); - auto memType = cast(createDescOp.getSource().getType()); - unsigned bitWidth = memType.getElementType().getIntOrFloatBitWidth(); - auto surfaceWidth = memType.getShape()[1] * (bitWidth / 8) - 1; - auto surfaceHeight = memType.getShape()[0] - 1; - // pitch = width for now - auto surfacePitch = surfaceWidth; - auto surfaceW = createIntConstant(i32Type, surfaceWidth); - auto surfaceH = createIntConstant(i32Type, surfaceHeight); - auto surfaceP = createIntConstant(i32Type, surfacePitch); - auto createOffset = [&](unsigned idx) -> Value { - Value val; - if (ShapedType::isDynamic(createDescOp.getStaticOffsets()[idx])) { - val = createDescOp.getOffsets()[idx]; - val = rewriter.create(loc, i32Type, val); - } else { - val = createIntConstant(i32Type, createDescOp.getStaticOffsets()[idx]); - } - return val; - }; - auto offsetX = createOffset(1); - auto offsetY = createOffset(0); - - SmallVector args{pred, l1CacheHint, l3CacheHint, dataum, - trans, nBlks, blockW, blockH, - transform, base, surfaceW, surfaceH, - surfaceP, offsetX, offsetY}; + auto tensorDesc = adaptor.getTensorDesc(); + auto idx0 = createIntConstant(i32Type, 0); + auto base = + rewriter.create(loc, tensorDesc, idx0); std::string typeStr; VectorType newType; - std::tie(typeStr, newType) = encodeVectorType(rewriter, vecType); - if constexpr (!isLoad && !isPrefetch) { - args.push_back(adaptor.getValue()); + std::tie(typeStr, newType) = encodeVectorType(rewriter, vecType, rank == 1); + SmallVector args; + if (rank == 2) { + auto blockWidth = tileType.getShape()[1]; + auto blockHeight = tileType.getShape()[0]; + auto blockW = createIntConstant(i32Type, blockWidth); + auto blockH = createIntConstant(i32Type, blockHeight); + auto transform = createIntConstant(i8Type, vnni ? 
1 : 0); + // static memref for now + auto createDescOp = + op.getTensorDesc().template getDefiningOp(); + auto memType = cast(createDescOp.getSource().getType()); + unsigned bitWidth = memType.getElementType().getIntOrFloatBitWidth(); + auto surfaceWidth = memType.getShape()[1] * (bitWidth / 8) - 1; + auto surfaceHeight = memType.getShape()[0] - 1; + // pitch = width for now + auto surfacePitch = surfaceWidth; + auto surfaceW = createIntConstant(i32Type, surfaceWidth); + auto surfaceH = createIntConstant(i32Type, surfaceHeight); + auto surfaceP = createIntConstant(i32Type, surfacePitch); + auto v4i32 = VectorType::get(4, i32Type); + tensorDesc = rewriter.create(loc, v4i32, tensorDesc); + auto idx2 = createIntConstant(i32Type, 2); + auto idx3 = createIntConstant(i32Type, 3); + auto offsetX = + rewriter.create(loc, tensorDesc, idx2); + auto offsetY = + rewriter.create(loc, tensorDesc, idx3); + args.assign({pred, l1CacheHint, l3CacheHint, dataum, trans, nBlks, blockW, + blockH, transform, base, surfaceW, surfaceH, surfaceP, + offsetX, offsetY}); + if constexpr (!isLoad && !isPrefetch) { + args.push_back(adaptor.getValue()); + } + } else if (rank == 1) { + auto subOpcode = + createIntConstant(i8Type, (isLoad || isPrefetch) ? 
0 : 4); + auto addrScale = createIntConstant(i16Type, 1); + auto immOffset = createIntConstant(i32Type, 0); + auto dataumSize = createIntConstant(i8Type, 4); + int lscVecSize = 0; + int numElts = newType.getNumElements(); + if (numElts <= 4) { + lscVecSize = numElts; + } else { + lscVecSize = log2(numElts) + 2; + } + auto vecSize = createIntConstant(i8Type, lscVecSize); + auto transposed = createIntConstant(i8Type, 2); // transpose + auto mask = createIntConstant(i8Type, 0); + auto surface = createIntConstant(i32Type, 0); + args.assign({ + pred, + subOpcode, + l1CacheHint, + l3CacheHint, + addrScale, + immOffset, + dataumSize, + vecSize, + transposed, + mask, + base, + }); + if constexpr (!isLoad && !isPrefetch) { + auto cast = + rewriter.create(loc, newType, adaptor.getValue()); + args.push_back(cast); + } + args.push_back(surface); } if constexpr (!isPrefetch) funcName += typeStr; @@ -234,19 +476,41 @@ class LoadStorePrefetchNdToLsc : public OpConversionPattern { lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); auto funcOp = rewriter.create(loc, retType, funcName, args); - rewriter.replaceOp(op, funcOp); + if (rank == 2) { + rewriter.replaceOp(op, funcOp); + } else { + auto cast = rewriter.create(loc, op.getType(), + funcOp->getResult(0)); + rewriter.replaceOp(op, cast); + } } else { auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); Operation *opPtr = op; lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); - auto funcOp = rewriter.create(loc, TypeRange(), - funcName, args); + rewriter.create(loc, TypeRange(), funcName, args); rewriter.eraseOp(op); } return success(); } }; +xegpu::CreateNdDescOp findDescOp(mlir::Value val) { + if (auto op = val.getDefiningOp()) { + if (auto descOp = dyn_cast(op)) { + return descOp; + } else if (auto update = dyn_cast(op)) { + return findDescOp(update.getTensorDesc()); + } + } else if (auto arg = dyn_cast(val)) { + auto ownerOp = arg.getOwner()->getParentOp(); + auto forOp = 
cast(ownerOp); + auto init = forOp.getInits()[arg.getArgNumber() - 1]; + return findDescOp(init); + } else { + assert(0 && "add more support"); + } +} + template class LoadStorePrefetchNdToRawSend : public OpConversionPattern { public: @@ -254,8 +518,9 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { LogicalResult matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - assert(op.getTensorDesc().getType().getShape().size() == 2 && - "only support 2d load/store/prefetch for now"); + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + assert(rank <= 2 && "only support 1d/2d load/store/prefetch for now"); auto loc = op->getLoc(); constexpr bool isLoad = std::is_same_v; constexpr bool isPrefetch = std::is_same_v; @@ -267,7 +532,6 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { /// collect common info auto i1Type = rewriter.getI1Type(); auto i8Type = rewriter.getI8Type(); - auto i16Type = rewriter.getI16Type(); auto i32Type = rewriter.getI32Type(); auto i64Type = rewriter.getI64Type(); auto vnni = false; @@ -280,14 +544,7 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { ? 
true : false; } - auto l1hint = op.getL1Hint(); - // auto l2hint = op.getL2Hint(); - auto l3hint = op.getL3Hint(); - auto tileType = op.getTensorDesc().getType(); - auto blockWidth = tileType.getShape()[1]; - auto blockHeight = tileType.getShape()[0]; auto elmType = tileType.getElementType(); - auto base = adaptor.getTensorDesc(); VectorType newType = VectorType::get(1, i32Type); std::string funcName; if constexpr (isPrefetch) { @@ -302,83 +559,11 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { funcName = "llvm_genx_raw_sends2_noresult_i1_v8i32_"; } std::string typeStr; - std::tie(typeStr, newType) = encodeVectorType(rewriter, vecType); + std::tie(typeStr, newType) = + encodeVectorType(rewriter, vecType, rank == 1); funcName += typeStr; } - auto createDescOp = - op.getTensorDesc().template getDefiningOp(); - // fixme: support memref for now - auto memType = cast(createDescOp.getSource().getType()); - unsigned bitWidth = memType.getElementType().getIntOrFloatBitWidth(); - auto surfaceWidth = memType.getShape()[1] * (bitWidth / 8) - 1; - auto surfaceHeight = memType.getShape()[0] - 1; - // fixme: pitch = width for now - auto surfacePitch = surfaceWidth; - auto surfaceW = createIntConstant(i32Type, surfaceWidth); - auto surfaceH = createIntConstant(i32Type, surfaceHeight); - auto surfaceP = createIntConstant(i32Type, surfacePitch); - auto createOffset = [&](unsigned idx) -> Value { - Value val; - if (ShapedType::isDynamic(createDescOp.getStaticOffsets()[idx])) { - val = createDescOp.getOffsets()[idx]; - val = rewriter.create(loc, i32Type, val); - } else { - val = createIntConstant(i32Type, createDescOp.getStaticOffsets()[idx]); - } - return val; - }; - auto offsetX = createOffset(1); - auto offsetY = createOffset(0); - int cacheHint = 1; - if constexpr (isLoad || isPrefetch) { - auto l1CacheValue = - l1hint.has_value() ? l1hint.value() : xegpu::CacheReadHint::UNCACHED; - auto l3CacheValue = - l3hint.has_value() ? 
l3hint.value() : xegpu::CacheReadHint::UNCACHED; - if (l1CacheValue == xegpu::CacheReadHint::UNCACHED) { - if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) - cacheHint = 1; - else if (l3CacheValue == xegpu::CacheReadHint::CACHED) - cacheHint = 2; - } else if (l1CacheValue == xegpu::CacheReadHint::CACHED) { - if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) - cacheHint = 3; - else if (l3CacheValue == xegpu::CacheReadHint::CACHED) - cacheHint = 4; - } else if (l1CacheValue == xegpu::CacheReadHint::STREAMING) { - if (l3CacheValue == xegpu::CacheReadHint::UNCACHED) - cacheHint = 5; - else if (l3CacheValue == xegpu::CacheReadHint::CACHED) - cacheHint = 6; - } else if (l1CacheValue == xegpu::CacheReadHint::READ_INVALIDATE) { - if (l3CacheValue == xegpu::CacheReadHint::CACHED) - cacheHint = 7; - } - } else { - auto l1CacheValue = - l1hint.has_value() ? l1hint.value() : xegpu::CacheWriteHint::UNCACHED; - auto l3CacheValue = - l3hint.has_value() ? l3hint.value() : xegpu::CacheWriteHint::UNCACHED; - if (l1CacheValue == xegpu::CacheWriteHint::UNCACHED) { - if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) - cacheHint = 1; - else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) - cacheHint = 2; - } else if (l1CacheValue == xegpu::CacheWriteHint::WRITE_THROUGH) { - if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) - cacheHint = 3; - else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) - cacheHint = 4; - } else if (l1CacheValue == xegpu::CacheWriteHint::STREAMING) { - if (l3CacheValue == xegpu::CacheWriteHint::UNCACHED) - cacheHint = 5; - else if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) - cacheHint = 6; - } else if (l1CacheValue == xegpu::CacheWriteHint::WRITE_BACK) { - if (l3CacheValue == xegpu::CacheWriteHint::WRITE_BACK) - cacheHint = 7; - } - } + unsigned cacheHint = encodeCacheHint(op); /// fill in parameters for raw.send // bit[1:0] EOT,sendc @@ -387,48 +572,93 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { auto 
pred = createIntConstant(i1Type, 1); auto numSrc1 = createIntConstant(i8Type, 1); unsigned numDstVal = newType.getNumElements() / 16; + if (rank == 1) { + numDstVal *= 2; + } auto numDst = createIntConstant(i8Type, numDstVal); // 15 for ugm auto sfid = createIntConstant(i8Type, 15); auto extMsg = createIntConstant(i32Type, 0); // message descriptor - // https://gfxspecs.intel.com/Predator/Home/Index/53680 - uint32_t rawSendMsg = (isLoad || isPrefetch) ? 3 : 7; - rawSendMsg |= (vnni ? 1 : 0) << 7; - rawSendMsg |= (encodeDataum(elmType) - 1) << 9; - rawSendMsg |= (transpose ? 1 : 0) << 15; - rawSendMsg |= cacheHint << 17; - rawSendMsg |= (isLoad ? numDstVal : 0) << 20; - rawSendMsg |= 1 << 25; + uint32_t rawSendMsg = 0; + if (rank == 2) { + // https://gfxspecs.intel.com/Predator/Home/Index/53680 + rawSendMsg |= (isLoad || isPrefetch) ? 3 : 7; + rawSendMsg |= (vnni ? 1 : 0) << 7; + rawSendMsg |= (encodeDataum(elmType) - 1) << 9; + rawSendMsg |= (transpose ? 1 : 0) << 15; + rawSendMsg |= cacheHint << 17; + rawSendMsg |= (isLoad ? numDstVal : 0) << 20; + rawSendMsg |= 1 << 25; + } else { + // rank == 1 + // https://gfxspecs.intel.com/Predator/Home/Index/53523 + rawSendMsg |= (isLoad || isPrefetch) ? 0 : 4; + rawSendMsg |= 3 << 7; + rawSendMsg |= 3 << 9; + rawSendMsg |= int(log2(newType.getNumElements()) + 1) << 12; + rawSendMsg |= 1 << 15; + rawSendMsg |= cacheHint << 17; + rawSendMsg |= (isLoad ? 
2 * numDstVal : 0) << 20; + rawSendMsg |= 1 << 25; + } auto msg = createIntConstant(i32Type, rawSendMsg); // payload + auto insertPoint = rewriter.saveInsertionPoint(); + CreateNdDescOp createDescOp = findDescOp(op.template getTensorDesc()); + rewriter.setInsertionPointAfter(createDescOp); auto v8i32 = VectorType::get(8, i32Type); auto v4i64 = VectorType::get(4, i64Type); Value payLoad = rewriter.create(loc, v4i64); auto idx0 = createIntConstant(i32Type, 0); - auto idx2 = createIntConstant(i32Type, 2); - auto idx3 = createIntConstant(i32Type, 3); - auto idx4 = createIntConstant(i32Type, 4); - auto idx5 = createIntConstant(i32Type, 5); - auto idx6 = createIntConstant(i32Type, 6); - auto idx7 = createIntConstant(i32Type, 7); + auto desc = rewriter.getRemappedValue(createDescOp); + auto base = rewriter.create(loc, desc, idx0); payLoad = rewriter.create(loc, payLoad, base, idx0); payLoad = rewriter.create(loc, v8i32, payLoad); - payLoad = rewriter.create(loc, payLoad, - surfaceW, idx2); - payLoad = rewriter.create(loc, payLoad, - surfaceH, idx3); - payLoad = rewriter.create(loc, payLoad, - surfaceP, idx4); - payLoad = rewriter.create(loc, payLoad, - offsetX, idx5); - payLoad = rewriter.create(loc, payLoad, - offsetY, idx6); - unsigned blockVal = ((blockHeight - 1) << 8) | (blockWidth - 1); - auto blockInfo = createIntConstant(i32Type, blockVal); - payLoad = rewriter.create(loc, payLoad, - blockInfo, idx7); + if (rank == 2) { + auto idx2 = createIntConstant(i32Type, 2); + auto idx3 = createIntConstant(i32Type, 3); + auto idx4 = createIntConstant(i32Type, 4); + auto idx5 = createIntConstant(i32Type, 5); + auto idx6 = createIntConstant(i32Type, 6); + auto idx7 = createIntConstant(i32Type, 7); + auto blockWidth = tileType.getShape()[1]; + auto blockHeight = tileType.getShape()[0]; + // fixme: support memref for now + auto memType = cast(createDescOp.getSource().getType()); + unsigned bitWidth = memType.getElementType().getIntOrFloatBitWidth(); + auto surfaceWidth = 
memType.getShape()[1] * (bitWidth / 8) - 1; + auto surfaceHeight = memType.getShape()[0] - 1; + // fixme: pitch = width for now + auto surfacePitch = surfaceWidth; + auto surfaceW = createIntConstant(i32Type, surfaceWidth); + auto surfaceH = createIntConstant(i32Type, surfaceHeight); + auto surfaceP = createIntConstant(i32Type, surfacePitch); + payLoad = rewriter.create(loc, payLoad, + surfaceW, idx2); + payLoad = rewriter.create(loc, payLoad, + surfaceH, idx3); + payLoad = rewriter.create(loc, payLoad, + surfaceP, idx4); + unsigned blockVal = ((blockHeight - 1) << 8) | (blockWidth - 1); + auto blockInfo = createIntConstant(i32Type, blockVal); + payLoad = rewriter.create(loc, payLoad, + blockInfo, idx7); + rewriter.restoreInsertionPoint(insertPoint); + auto v4i32 = VectorType::get(4, i32Type); + auto tensorDesc = adaptor.getTensorDesc(); + tensorDesc = rewriter.create(loc, v4i32, tensorDesc); + auto offsetX = + rewriter.create(loc, tensorDesc, idx2); + auto offsetY = + rewriter.create(loc, tensorDesc, idx3); + payLoad = rewriter.create(loc, payLoad, + offsetX, idx5); + payLoad = rewriter.create(loc, payLoad, + offsetY, idx6); + } + rewriter.restoreInsertionPoint(insertPoint); SmallVector args{modifier, execSize, pred, numSrc1, numDst, sfid, extMsg, msg, payLoad}; if constexpr (isLoad) { @@ -442,12 +672,25 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern { lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); auto funcOp = rewriter.create(loc, retType, funcName, args); - rewriter.replaceOp(op, funcOp); + if (rank == 2) { + rewriter.replaceOp(op, funcOp); + } else { + auto cast = rewriter.create(loc, op.getType(), + funcOp->getResult(0)); + rewriter.replaceOp(op, cast); + } } else { if constexpr (isPrefetch) args.erase(args.begin() + 4); - else - args.push_back(adaptor.getValue()); + else { + if (rank == 2) { + args.push_back(adaptor.getValue()); + } else if (rank == 1) { + auto cast = rewriter.create(loc, newType, + adaptor.getValue()); + 
args.push_back(cast); + } + } auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); Operation *opPtr = op; lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); @@ -522,12 +765,473 @@ class DpasToVCPattern : public OpConversionPattern { return success(); } }; + +template +class GatherScatterToRawSend : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + assert(rank <= 2 && "only support 1d/2d for now"); + auto loc = op->getLoc(); + constexpr bool isLoad = std::is_same_v; + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + + /// collect common info + auto i1Type = rewriter.getI1Type(); + auto i8Type = rewriter.getI8Type(); + auto i32Type = rewriter.getI32Type(); + auto i64Type = rewriter.getI64Type(); + auto tensorDesc = adaptor.getTensorDesc(); + auto idx0 = createIntConstant(i32Type, 0); + auto base = + rewriter.create(loc, tensorDesc, idx0); + VectorType newType = VectorType::get(1, i32Type); + std::string funcName; + VectorType vecType; + if constexpr (isLoad) { + vecType = cast(op.getResult().getType()); + funcName = "llvm_genx_raw_send2_"; + } else { + vecType = cast(op.getValue().getType()); + funcName = "llvm_genx_raw_sends2_noresult_i1_v8i32_"; + } + std::string typeStr; + std::tie(typeStr, newType) = encodeVectorType(rewriter, vecType); + funcName += typeStr; + unsigned cacheHint = encodeCacheHint(op); + + /// fill in parameters for raw.send + // bit[1:0] EOT,sendc + auto modifier = createIntConstant(i8Type, 0); + auto execSize = createIntConstant(i8Type, 4); + auto pred = createIntConstant(i1Type, 1); + auto numSrc1 = createIntConstant(i8Type, 2); + unsigned numDstVal 
= newType.getNumElements() / 16; + auto numDst = createIntConstant(i8Type, numDstVal); + // 15 for ugm + auto sfid = createIntConstant(i8Type, 15); + auto extMsg = createIntConstant(i32Type, 0); + auto vecSize = 0; + if (numDstVal <= 4) { + vecSize = numDstVal - 1; + } else { + vecSize = log2(numDstVal) + 1; + } + // message descriptor + uint32_t rawSendMsg = 0; + rawSendMsg |= (isLoad) ? 0 : 4; + rawSendMsg |= 3 << 7; // A64 + rawSendMsg |= 2 << 9; // D32 + rawSendMsg |= vecSize << 12; + rawSendMsg |= cacheHint << 17; + rawSendMsg |= (isLoad ? numDstVal : 0) << 20; + rawSendMsg |= 2 << 25; + auto msg = createIntConstant(i32Type, rawSendMsg); + // payload + auto v16i64 = VectorType::get(16, i64Type); + Value payLoad = rewriter.create(loc, v16i64); + payLoad = + rewriter.create(loc, payLoad, base, idx0); + SmallVector indices(16, 0); + payLoad = rewriter.create( + loc, v16i64, payLoad, payLoad, rewriter.getI32ArrayAttr(indices)); + auto createDescOp = + op.getTensorDesc().template getDefiningOp(); + auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); + payLoad = rewriter.create(loc, v16i64, payLoad, offsets); + SmallVector args{modifier, execSize, pred, numSrc1, numDst, + sfid, extMsg, msg, payLoad}; + if constexpr (isLoad) { + funcName += "_i1_v16i64"; + auto old = rewriter.create(loc, newType); + args.push_back(old); + auto retType = newType; + auto funcType = + rewriter.getFunctionType(ValueRange(args).getTypes(), retType); + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + auto funcOp = + rewriter.create(loc, retType, funcName, args); + auto castTy = this->getTypeConverter()->convertType(op.getType()); + auto cast = + rewriter.create(loc, castTy, funcOp->getResult(0)); + rewriter.replaceOp(op, cast); + } else { + Value data = adaptor.getValue(); + if (data.getType() != newType) { + data = rewriter.create(loc, newType, data); + } + args.push_back(data); + auto funcType = 
rewriter.getFunctionType(ValueRange(args).getTypes(), {}); + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + rewriter.create(loc, TypeRange(), funcName, args); + rewriter.eraseOp(op); + } + return success(); + } +}; + +class AtomicToLsc : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(AtomicRMWOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + assert(rank <= 2 && "only support 1d/2d for now"); + auto loc = op->getLoc(); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + + /// collect common info + auto i1Type = rewriter.getI1Type(); + auto i8Type = rewriter.getI8Type(); + auto i16Type = rewriter.getI16Type(); + auto i32Type = rewriter.getI32Type(); + auto i64Type = rewriter.getI64Type(); + VectorType vecType = cast(op.getResult().getType()); + std::string funcName = "llvm_genx_lsc_xatomic_stateless_"; + auto [typeStr, newType] = encodeVectorType(rewriter, vecType, false, true); + funcName += typeStr; + + /// fill in parameters for lsc + auto v16i1 = VectorType::get(16, i1Type); + auto vecAttr = DenseElementsAttr::get(v16i1, true); + auto pred = rewriter.create(loc, v16i1, vecAttr); + auto subOpcode = createIntConstant(i8Type, encodeOpcode(op.getKind())); + auto l1CacheHint = createIntConstant(i8Type, 1); + auto l3CacheHint = createIntConstant(i8Type, 1); + auto addrScale = createIntConstant(i16Type, 1); + auto immOffset = createIntConstant(i32Type, 0); + unsigned dataSize = encodeDataum(vecType.getElementType()); + auto dataumSize = createIntConstant(i8Type, dataSize); + unsigned numDstVal = newType.getNumElements() / 16; + auto lscVecSize = 0; + if (numDstVal <= 4) { + lscVecSize = numDstVal; + } else { + lscVecSize = 
log2(numDstVal) + 2; + } + auto vecSize = createIntConstant(i8Type, lscVecSize); + auto transposed = createIntConstant(i8Type, 1); + auto mask = createIntConstant(i8Type, 0); + + auto tensorDesc = adaptor.getTensorDesc(); + auto idx0 = createIntConstant(i32Type, 0); + auto base = + rewriter.create(loc, tensorDesc, idx0); + // payload + auto v16i64 = VectorType::get(16, i64Type); + Value payLoad = rewriter.create(loc, v16i64); + payLoad = + rewriter.create(loc, payLoad, base, idx0); + SmallVector indices(16, 0); + payLoad = rewriter.create( + loc, v16i64, payLoad, payLoad, rewriter.getI32ArrayAttr(indices)); + auto createDescOp = + op.getTensorDesc().template getDefiningOp(); + auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); + payLoad = rewriter.create(loc, v16i64, payLoad, offsets); + // src + auto v16i32 = VectorType::get(16, i32Type); + Value undef = rewriter.create(loc, v16i32); + Value src0 = undef; + if (op.getValue()) { + src0 = op.getValue(); + if (src0.getType() != newType) { + src0 = rewriter.create(loc, newType, src0); + } + } + Value src1 = undef; + auto surface = createIntConstant(i32Type, 0); + SmallVector args{pred, subOpcode, l1CacheHint, l3CacheHint, + addrScale, immOffset, dataumSize, vecSize, + transposed, mask, payLoad, src0, + src1, surface, undef}; + funcName += "_v16i1_v16i64"; + auto retType = newType; + auto funcType = + rewriter.getFunctionType(ValueRange(args).getTypes(), retType); + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + auto funcOp = + rewriter.create(loc, retType, funcName, args); + auto castTy = this->getTypeConverter()->convertType(op.getType()); + auto cast = + rewriter.create(loc, castTy, funcOp->getResult(0)); + rewriter.replaceOp(op, cast); + return success(); + } +}; + +Value createConstantI32(Location loc, PatternRewriter &rewriter, int32_t v) { + auto i32ty = rewriter.getIntegerType(32); + return rewriter.create(loc, i32ty, + IntegerAttr::get(i32ty, v)); +} 
+ +#define zext(...) rewriter.create(loc, __VA_ARGS__) +#define logic_shl(...) \ + rewriter.create(loc, __VA_ARGS__) +#define bitwise_or(...) rewriter.create(loc, __VA_ARGS__) +#define bitwise_and(...) rewriter.create(loc, __VA_ARGS__) +#define i32_val(...) createConstantI32(loc, rewriter, __VA_ARGS__) +#define i8_val(value) \ + rewriter.create(loc, rewriter.getIntegerType(8), \ + rewriter.getI8IntegerAttr(value)) +#define i1_val(value) \ + rewriter.create(loc, rewriter.getI1Type(), \ + rewriter.getBoolAttr(value)) + +class AllocNbarrierToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(AllocNbarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard guard(rewriter); + auto func = op->getParentOfType(); + rewriter.setInsertionPointAfter(func); + rewriter.create( + op.getLoc(), func, spirv::ExecutionMode::NamedBarrierCountINTEL, + op.getNbarrierCount()); + rewriter.eraseOp(op); + return success(); + } +}; + +class CreateNbarrierToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateNbarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto nbarrier_id = op.getNbarrierId(); + auto nbarrier_role = op.getNbarrierRole(); + auto num_producers = op.getNumProducers(); + auto num_consumers = op.getNumConsumers(); + + auto i32Type = rewriter.getIntegerType(32); + auto v8i32Type = mlir::VectorType::get(8, i32Type); + + DenseElementsAttr constantData = DenseElementsAttr::get( + v8i32Type, ArrayRef(std::vector(1, 0))); + Value nbarrier_src = + rewriter.create(loc, v8i32Type, constantData); + + // payload format https://gfxspecs.intel.com/Predator/Home/Index/72064 + Value payload = zext(i32Type, nbarrier_id); + + Value payload_nbarrier_role = + logic_shl(i32Type, zext(i32Type, 
nbarrier_role), i32_val(14)); + payload = bitwise_or(i32Type, payload, payload_nbarrier_role); + + Value payload_num_producers = + logic_shl(i32Type, i32_val(num_producers), i32_val(16)); + payload = bitwise_or(i32Type, payload, payload_num_producers); + + Value payload_num_consumers = + logic_shl(i32Type, i32_val(num_consumers), i32_val(24)); + payload = bitwise_or(i32Type, payload, payload_num_consumers); + + nbarrier_src = rewriter.create( + loc, v8i32Type, nbarrier_src, payload, i32_val(2)); + rewriter.replaceOp(op, nbarrier_src); + + return success(); + } +}; + +class NbarrierArriveToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(NbarrierArriveOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto payload = op.getPayload(); + + std::string funcName = "llvm_genx_raw_send2_noresult_i1_v8i32"; + + // desc format + // https://github.com/intel-innersource/drivers.gpu.compute.vc-intrinsics/blob/cmc_experimental/GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsic_definitions.py#L4595 + Value modifier = i8_val(0); + Value exec_size = i8_val(0); + Value predicate = i1_val(1); + Value numsrc1 = i8_val(1); // register nums of payload + Value sfid = + i8_val(3); // https://gfxspecs.intel.com/Predator/Home/Index/47532 + Value etDesc = i32_val(0); + Value msg_desc = i32_val( + 0x2000004); // https://gfxspecs.intel.com/Predator/Home/Index/53524 + + SmallVector args{modifier, exec_size, predicate, numsrc1, + sfid, etDesc, msg_desc, payload}; + + auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); + + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + rewriter.create(loc, TypeRange(), funcName, args); + + rewriter.eraseOp(op); + return success(); + } +}; + +class NbarrierWaitToVCPattern : public OpConversionPattern { +public: + using 
OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(NbarrierWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto payload = op.getPayload(); + + auto i8Type = rewriter.getIntegerType(8); + auto i32Type = rewriter.getIntegerType(32); + auto nbarrier_src = rewriter.create( + loc, i32Type, payload, i32_val(2)); + auto nbarrier_id = + zext(i8Type, bitwise_and(i32Type, nbarrier_src, i32_val(0xFF))); + + Value signal_flag = i8_val(0); // 0b0: wait 0b1: signal + Value num_threads = i8_val(0); // This field is ignored for nbarrier.wait + + std::string funcName = "llvm_genx_nbarrier"; + SmallVector args{signal_flag, nbarrier_id, num_threads}; + + auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); + + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + rewriter.create(loc, TypeRange(), funcName, args); + + rewriter.eraseOp(op); + return success(); + } +}; + +class CompilerHintToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CompilerHintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + std::string funcName = "llvm_genx_fence"; + Value fence_flag = i8_val(-128); + SmallVector args{fence_flag}; + auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); + + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + rewriter.create(loc, TypeRange(), funcName, args); + + rewriter.eraseOp(op); + return success(); + } +}; + +class MfenceToVCPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(MfenceOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto pred = i1_val(1); + auto fence_op_attr = 
op.getFenceOpAttr().str(); + auto fence_scope_attr = op.getFenceScopeAttr().str(); + auto memory_kind_attr = op.getMemoryKindAttr().str(); + + std::vector lscFenceOp{"none", "evict", "invalidate", + "discard", "clean", "flushl3"}; + std::vector lscFenceScope{"group", "local", "tile", "gpu", + "gpus", "system", "sysacq"}; + std::vector lscSFID{"ugm", "ugml", "tgm", "slm"}; + + uint8_t fence_op, fence_scope, sfid; + + auto it = std::find(lscFenceOp.begin(), lscFenceOp.end(), fence_op_attr); + if (it != lscFenceOp.end()) { + fence_op = std::distance(lscFenceOp.begin(), it); + } else { + llvm_unreachable("unsupported value for lsc_fence_op attribute"); + } + + it = + std::find(lscFenceScope.begin(), lscFenceScope.end(), fence_scope_attr); + if (it != lscFenceScope.end()) { + fence_scope = std::distance(lscFenceScope.begin(), it); + } else { + llvm_unreachable("unsupported value for lsc_fence_scope attribute"); + } + + it = std::find(lscSFID.begin(), lscSFID.end(), memory_kind_attr); + if (it != lscSFID.end()) { + sfid = std::distance(lscSFID.begin(), it); + } else { + llvm_unreachable("unsupported value for memory_kind attribute"); + } + + SmallVector args{pred, i8_val(sfid), i8_val(fence_op), + i8_val(fence_scope)}; + auto funcType = rewriter.getFunctionType(ValueRange(args).getTypes(), {}); + + std::string funcName = "llvm.genx.lsc.fence.i1"; + + Operation *opPtr = op; + lookupOrInsertIntrinsic(rewriter, opPtr, funcName, funcType); + rewriter.create(loc, TypeRange(), funcName, args); + + rewriter.eraseOp(op); + return success(); + } +}; +/// add necessary vectorTospirv patterns (different from upstream) +struct VectorShapeCast final : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(vector::ShapeCastOp shapeCastOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstType = getTypeConverter()->convertType(shapeCastOp.getType()); + if (!dstType) + return failure(); + if 
(dstType == adaptor.getSource().getType() || + shapeCastOp.getResultVectorType().getNumElements() == 1) { + rewriter.replaceOp(shapeCastOp, adaptor.getSource()); + return success(); + } + rewriter.replaceOpWithNewOp(shapeCastOp, dstType, + adaptor.getSource()); + return success(); + } +}; } // namespace void imex::populateXeGPUToVCIntrinsicsPatterns( SPIRVTypeConverter &typeConverter, RewritePatternSet &patterns) { - patterns.add(typeConverter, - patterns.getContext()); + patterns.add, + GatherScatterToRawSend, AtomicToLsc, + UpdateNDOffsetToVCPattern>(typeConverter, patterns.getContext()); if (getenv("IMEX_NOT_PREFER_RAWSEND")) patterns.add, LoadStorePrefetchNdToLsc, diff --git a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index f9ef3a0d8..6e55c2574 100644 --- a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -457,9 +457,9 @@ void CreateDescOp::print(::mlir::OpAsmPrinter &printer) { } mlir::LogicalResult CreateDescOp::verify() { - if (getRankOf(getSource()) > 1) + if (getRankOf(getSource()) > 2) return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); + "Expecting the source is a 2D/1D memref or pointer (uint64_t)."); std::vector shape; @@ -471,9 +471,10 @@ mlir::LogicalResult CreateDescOp::verify() { if (llvm::isa(offsetTy)) { shape = llvm::dyn_cast(offsetTy).getShape().vec(); - if (shape.size() != 1) - return emitOpError("Expecting the offset is either a 1D vector (for VC) " - "or scalar (for SIMT)."); + if (shape.size() > 2) + return emitOpError( + "Expecting the offset is either a 2D/1D vector (for VC) " + "or scalar (for SIMT)."); } if (offsetTy.isIndex() || chunkSize != 1) { @@ -572,9 +573,9 @@ mlir::LogicalResult LoadNDOp::verify() { auto tdescTy = getTensorDesc().getType(); auto valueTy = llvm::dyn_cast(getValue().getType()); - if (tdescTy.getRank() != 2) + if (tdescTy.getRank() > 2) return emitOpError( - "The TensorDesc for LoadNDOp should be a 2D TensorDesc."); + "The 
TensorDesc for LoadNDOp should be a 2D/1D TensorDesc."); if (!valueTy) return emitOpError("Invalid result, it should be a VectorType.\n"); @@ -586,7 +587,8 @@ mlir::LogicalResult LoadNDOp::verify() { return emitOpError( "Value should have the same element type as TensorDesc."); - { // TODO: The following logic are architecture dependent, pending to be moved + if (tdescTy.getRank() == 2) { // TODO: The following logic are architecture + // dependent, pending to be moved // out auto width = tdescTy.getShape()[1]; auto height = tdescTy.getShape()[0]; @@ -789,7 +791,7 @@ mlir::LogicalResult StoreNDOp::verify() { auto dstTy = getTensorDesc().getType(); // Tile auto valTy = llvm::dyn_cast(getValue().getType()); // Vector - if (dstTy.getRank() != 2) + if (dstTy.getRank() > 2) return emitOpError( "The TensorDesc for StoreNdOp should be a 2D TensorDesc."); @@ -804,7 +806,8 @@ mlir::LogicalResult StoreNDOp::verify() { "the elem type of memory (dst) shape.\n"); } - { // TODO: The following logic are architecture dependent, pending to be moved + if (dstTy.getRank() == 2) { // TODO: The following logic are architecture + // dependent, pending to be moved // out auto width = dstTy.getShape()[1]; auto height = dstTy.getShape()[0]; @@ -900,7 +903,6 @@ mlir::LogicalResult StoreNDOp::verify() { "In SIMT mode, the value (vector) shape doesn't match the memory" "(dst) shape as derived according to the mapping rule.\n"); } - return mlir::success(); } diff --git a/test/Conversion/XeGPUToSPIRV/atomic_basic.mlir b/test/Conversion/XeGPUToSPIRV/atomic_basic.mlir new file mode 100644 index 000000000..5e0122015 --- /dev/null +++ b/test/Conversion/XeGPUToSPIRV/atomic_basic.mlir @@ -0,0 +1,30 @@ +// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK +module @gemm attributes {gpu.container_module} { + memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<4.000000e-01> + func.func @test(%arg0: memref<8x16xf32>) -> memref<8x16xf32> attributes 
{llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>) + return %memref : memref<8x16xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK: llvm_genx_lsc_xatomic_stateless_v16i32_v16i1_v16i64 + %mask = arith.constant dense : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> + %1 = arith.constant dense<0.5> : vector<16xf32> + %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size_per_lane = 1} : memref<8x16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> + %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %0 = memref.get_global @__constant_8x16xf32 : memref<8x16xf32> + %2 = call @test(%0) : (memref<8x16xf32>) -> (memref<8x16xf32>) + %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + // call @printMemrefF32(%cast) : (memref<*xf32>) -> () + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Conversion/XeGPUToSPIRV/barrier_basic.mlir b/test/Conversion/XeGPUToSPIRV/barrier_basic.mlir new file mode 100644 index 000000000..e95c9eb78 --- /dev/null +++ b/test/Conversion/XeGPUToSPIRV/barrier_basic.mlir @@ -0,0 +1,44 @@ +// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +module @gemm attributes {gpu.container_module} { + 
memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> + memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf16> + gpu.dealloc %memref_0 : memref<16x16xf16> + return %memref_1 : memref<8x16xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32 + // CHECK: spirv.FunctionCall @llvm.genx.lsc.fence.i1 + // CHECK: spirv.FunctionCall @llvm_genx_fence + // CHECK: spirv.FunctionCall @llvm_genx_nbarrier + // CHECK: spirv.ExecutionMode @test_kernel "NamedBarrierCountINTEL", 16 + xegpu.alloc_nbarrier 16 + %nbarrier_id = arith.constant 1 : i8 + %nbarrier_role = arith.constant 0 : i8 + %payload = xegpu.create_nbarrier %nbarrier_id, %nbarrier_role {num_producers = 32 : i8, num_consumers = 32 : i8} : (i8, i8) -> vector<8xi32> + xegpu.nbarrier_arrive %payload : vector<8xi32> + xegpu.mfence {memory_kind = "ugm" , fence_op = "none", fence_scope = "local"} + 
xegpu.compiler_hint + xegpu.nbarrier_wait %payload : vector<8xi32> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> + %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> + %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + //call @printMemrefF32(%cast) : (memref<*xf32>) -> () + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Conversion/XeGPUToVC/gemm_basic.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir similarity index 92% rename from test/Conversion/XeGPUToVC/gemm_basic.mlir rename to test/Conversion/XeGPUToSPIRV/gemm_basic.mlir index 47716f446..a4c12ec46 100644 --- a/test/Conversion/XeGPUToVC/gemm_basic.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir @@ -23,6 +23,11 @@ module @gemm attributes {gpu.container_module} { // LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64 // LSC: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32 // LSC: spirv.FunctionCall @llvm_genx_lsc_store2d_stateless_i1_i64_v128f32 + // CHECK: %[[BASE:.*]] = spirv.ConvertPtrToU %arg0 : !spirv.ptr, CrossWorkgroup> to i64 + // CHECK: %[[BASE1:.*]] = spirv.VectorInsertDynamic %[[BASE]] + // CHECK: %[[BASE2:.*]] = spirv.Bitcast %[[BASE1]] + // CHECK: spirv.VectorInsertDynamic + // CHECK: spirv.VectorInsertDynamic // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32 // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32 // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v64i32_i1_v8i32 diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.mlir new file mode 100644 index 000000000..39569d345 --- /dev/null +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.mlir @@ -0,0 +1,49 @@ +// RUN: imex-opt 
-imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW +// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-LSC +module @gemm attributes {gpu.container_module} { + memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> + memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf16> + gpu.dealloc %memref_0 : memref<16x16xf16> + return %memref_1 : memref<8x16xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK-RAW: %[[BASE:.*]] = spirv.ConvertPtrToU %arg0 : !spirv.ptr, CrossWorkgroup> to i64 + // CHECK-RAW: spirv.VectorInsertDynamic %[[BASE]] + // CHECK-RAW: spirv.FunctionCall @llvm_genx_raw_send2_v32i64_i1_v8i32 + // CHECK-RAW: spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v64i64 + // CHECK-LSC: spirv.FunctionCall @llvm_genx_lsc_load_stateless_v32i64_i1_i64 + // CHECK-LSC: spirv.FunctionCall 
@llvm_genx_lsc_store_stateless_i1_i64_v64i64 + %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf16> to memref<128xf16> + %0 = xegpu.create_nd_tdesc %arg00[0] {mode = vc} : memref<128xf16> -> !xegpu.tensor_desc<128xf16> + %1 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc, boundary_check = true} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf32> to memref<128xf32> + %2 = xegpu.create_nd_tdesc %arg02[0] {mode = vc} : memref<128xf32> -> !xegpu.tensor_desc<128xf32> + %3 = xegpu.load_nd %0 {mode = vc}: !xegpu.tensor_desc<128xf16> -> vector<128xf16> + %4 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %6 = vector.shape_cast %3: vector<128xf16> to vector<8x8x2xf16> + %5 = xegpu.dpas %6, %4 {mode = vc}: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32> + xegpu.store_nd %7, %2 {mode = vc}: vector<128xf32>, !xegpu.tensor_desc<128xf32> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> + %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> + %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + //call @printMemrefF32(%cast) : (memref<*xf32>) -> () + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.mlir new file mode 100644 index 000000000..ee7986859 --- /dev/null +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.mlir @@ -0,0 +1,53 @@ +// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW +module @gemm attributes 
{gpu.container_module} { + memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> + memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf16> + gpu.dealloc %memref_0 : memref<16x16xf16> + return %memref_1 : memref<8x16xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK-RAW: spirv.FunctionCall @llvm_genx_raw_send2_v64i32_i1_v16i64 + // CHECK-RAW: spirv.FunctionCall @llvm_genx_raw_send2_v128i32_i1_v8i32 + %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16, 8], strides: [8, 1] : memref<8x16xf16> to memref<16x8xf16> + %offsets = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> + %0 = xegpu.create_tdesc %arg00, %offsets {mode = vc, chunk_size_per_lane = 8} : memref<16x8xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scattered> + %cst = arith.constant dense<true> : vector<128xi1> + %mask = vector.shape_cast %cst :
vector<128xi1> to vector<16x8xi1> + %3 = xegpu.load %0, %mask {mode = vc}: !xegpu.tensor_desc<16x8xf16, #xegpu.scattered>, vector<16x8xi1> -> vector<16x8xf16> + %66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16> + %6 = vector.shape_cast %66: vector<128xf16> to vector<8x8x2xf16> + + %1 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc, boundary_check = true} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %4 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + %5 = xegpu.dpas %6, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + + %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [16, 8], strides: [8, 1] : memref<8x16xf32> to memref<16x8xf32> + %offsets2 = arith.constant dense<[0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480]> : vector<16xindex> + %2 = xegpu.create_tdesc %arg02, %offsets2 {mode = vc, chunk_size_per_lane = 8} : memref<16x8xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> + %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32> + %8 = vector.shape_cast %7: vector<128xf32> to vector<16x8xf32> + xegpu.store %8, %2, %mask {mode = vc}: vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> + %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> + %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + //call @printMemrefF32(%cast) : (memref<*xf32>) -> () + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Conversion/XeGPUToSPIRV/update_offset.mlir b/test/Conversion/XeGPUToSPIRV/update_offset.mlir new file mode 100644 index 000000000..bb5905b13 --- /dev/null +++ 
b/test/Conversion/XeGPUToSPIRV/update_offset.mlir @@ -0,0 +1,47 @@ +// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +module @gemm attributes {gpu.container_module} { + memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> + memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf16> + gpu.dealloc %memref_0 : memref<16x16xf16> + return %memref_1 : memref<8x16xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + // CHECK: llvm_genx_raw_send2_v64i32_i1_v8i32 + // CHECK: llvm_genx_raw_send2_v64i32_i1_v8i32 + %0 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc}: memref<16x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.load_nd %0 {vnni_axis = 1, mode = vc} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16> + %2 = xegpu.update_nd_offset %0, [%c8, %c0] {mode = vc}: !xegpu.tensor_desc<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %3 = 
xegpu.load_nd %2 {vnni_axis = 1, mode = vc} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16> + %lhs = vector.shape_cast %1 : vector<8x8x2xf16> to vector<128xf16> + %rhs = vector.shape_cast %3 : vector<8x8x2xf16> to vector<128xf16> + %add = arith.addf %lhs, %rhs: vector<128xf16> + %out = arith.extf %add : vector<128xf16> to vector<128xf32> + %cast = vector.shape_cast %out : vector<128xf32> to vector<8x16xf32> + %c = xegpu.create_nd_tdesc %arg2[0, 0] {mode = vc}: memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %cast, %c {mode = vc}: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> + %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> + %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%cast) : (memref<*xf32>) -> () + // : 2.19922, 2.19922, 2.19922, 2.19922 + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Conversion/XeGPUToVC/xegpu-to-llvm.pp b/test/Conversion/XeGPUToSPIRV/xegpu-to-llvm.pp similarity index 100% rename from test/Conversion/XeGPUToVC/xegpu-to-llvm.pp rename to test/Conversion/XeGPUToSPIRV/xegpu-to-llvm.pp diff --git a/test/Conversion/XeGPUToVC/lit.local.cfg b/test/Conversion/XeGPUToVC/lit.local.cfg deleted file mode 100644 index 508faf2b0..000000000 --- a/test/Conversion/XeGPUToVC/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -local_excludes = ['gemm_1024x1024xf16.runnable.mlir'] - -config.excludes.update(local_excludes) diff --git a/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/test/Dialect/XeGPU/IR/atomic_rmw.mlir index 9de90500b..5f4ea2919 100644 --- a/test/Dialect/XeGPU/IR/atomic_rmw.mlir +++ b/test/Dialect/XeGPU/IR/atomic_rmw.mlir @@ -9,8 +9,8 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : 
vector<16 x index>, %value : v %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> // CHECK: xegpu.atomic_rmw - // CHECK-SAME: (vector<16x1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>) - xegpu.atomic_rmw "addf" %value, %1, %mask {mode = vc} : (vector<16x1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>) + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> + xegpu.atomic_rmw "addf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32> return } @@ -20,8 +20,8 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> // CHECK: xegpu.atomic_rmw - // CHECK-SAME: (vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>) - xegpu.atomic_rmw "mulf" %value, %1, %mask {mode = vc} : (vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>) + // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> + xegpu.atomic_rmw "mulf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> return } @@ -31,8 +31,8 @@ func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered> // CHECK: xegpu.atomic_rmw - // CHECK-SAME: (vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>) - xegpu.atomic_rmw "andi" %value, %1, %mask {mode = vc} : (vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>) + // CHECK-SAME: 
!xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> + xegpu.atomic_rmw "andi" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xi32> return } diff --git a/test/Dialect/XeGPU/IR/barrier_ops.mlir b/test/Dialect/XeGPU/IR/barrier_ops.mlir index f475a84db..079e916b4 100644 --- a/test/Dialect/XeGPU/IR/barrier_ops.mlir +++ b/test/Dialect/XeGPU/IR/barrier_ops.mlir @@ -6,10 +6,8 @@ // CHECK-LABEL: func @alloc_nbarrier({{.*}}) { func.func @alloc_nbarrier() { - %c8_i8 = arith.constant 8 : i8 // CHECK: xegpu.alloc_nbarrier - // CHECK-SAME: : i8 - xegpu.alloc_nbarrier %c8_i8 : i8 + xegpu.alloc_nbarrier 8 return } diff --git a/test/Conversion/XeGPUToVC/gemm_1024x1024xf16.runnable.mlir b/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir similarity index 100% rename from test/Conversion/XeGPUToVC/gemm_1024x1024xf16.runnable.mlir rename to test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir diff --git a/test/Integration/Dialect/XeGPU/lit.local.cfg b/test/Integration/Dialect/XeGPU/lit.local.cfg new file mode 100644 index 000000000..50449795e --- /dev/null +++ b/test/Integration/Dialect/XeGPU/lit.local.cfg @@ -0,0 +1,3 @@ +local_excludes = ['gemm_1024x1024xf16.mlir'] + +config.excludes.update(local_excludes) diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp new file mode 100644 index 000000000..5a66cfc36 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp @@ -0,0 +1,17 @@ +builtin.module( + imex-convert-gpu-to-spirv + spirv.module(spirv-lower-abi-attrs + spirv-update-vce) + func.func(llvm-request-c-wrappers) + serialize-spirv + convert-gpu-to-gpux + convert-scf-to-cf + convert-cf-to-llvm + convert-arith-to-llvm + convert-func-to-llvm + convert-math-to-llvm + convert-gpux-to-llvm + expand-strided-metadata + lower-affine + finalize-memref-to-llvm + reconcile-unrealized-casts)