From 75ce2dc475b12c12419904cd67b9d12b79a66242 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 22 Jan 2025 09:00:02 -0800 Subject: [PATCH 001/208] [llvm][AArch64] apple-m4 does not have FEAT_{SPEv1p2,SEL2,MPAM} (#123827) This commit addresses some uncertainty raised in 84fa1755a5b7845ddaeaa513a3786013c76c9c88 as to which features Apple M4 has. --- .../test/Driver/print-enabled-extensions/aarch64-apple-m4.c | 5 ++++- llvm/lib/Target/AArch64/AArch64Processors.td | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-apple-m4.c b/clang/test/Driver/print-enabled-extensions/aarch64-apple-m4.c index 0e40c132bebdc8..b4101245273223 100644 --- a/clang/test/Driver/print-enabled-extensions/aarch64-apple-m4.c +++ b/clang/test/Driver/print-enabled-extensions/aarch64-apple-m4.c @@ -34,6 +34,8 @@ // CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC instructions with Immediate Offsets // CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions // CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules +// FIXME: Apple M4 does not have FEAT_MPAM, but it is currently marked as +// non-optional in llvm's understanding of Armv8.4-A // CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension // CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement // CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension @@ -43,6 +45,8 @@ // CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions // CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions // CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier +// FIXME: Apple M4 does not have FEAT_SEL2, but it is currently marked as +// non-optional in llvm's understanding of Armv8.4-A // CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension 
// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support // CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support @@ -51,7 +55,6 @@ // CHECK-NEXT: FEAT_SME_F64F64 Enable Scalable Matrix Extension (SME) F64F64 instructions // CHECK-NEXT: FEAT_SME_I16I64 Enable Scalable Matrix Extension (SME) I16I64 instructions // CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions -// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension // CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions // CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension // CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 2de8d4637d3729..0e3c4e8397f526 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -929,8 +929,7 @@ def ProcessorFeatures { FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, FeaturePAuth, FeatureFPAC, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureDotProd, FeatureMatMulInt8, - FeatureSPE_EEF]; + FeatureDotProd, FeatureMatMulInt8]; list ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeaturePerfMon, FeatureNEON, FeatureFPARMv8]; list ExynosM4 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd, From fa6f88af102cb79a0371725b487e929cb0bcfcb2 Mon Sep 17 00:00:00 2001 From: Petr Kurapov Date: Wed, 22 Jan 2025 18:03:36 +0100 Subject: [PATCH 002/208] =?UTF-8?q?[MLIR][XeGPU]=20Allow=20some=20nd=20ops?= =?UTF-8?q?=20to=20have=20argument=20shapes=20mismatch=20for=20=E2=80=A6?= =?UTF-8?q?=20(#120566)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …the distributed IR case. This patch allows `nd_load` and `nd_store` to preserve the tensor descriptor shape during distribution to SIMT. 
The validation now expects the distributed instruction to retain the `sg_map` attribute and uses it to verify the consistency. --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 3 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 83 ++++++++++++++----- mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 24 ++++++ mlir/test/Dialect/XeGPU/invalid.mlir | 30 +++++-- 4 files changed, 112 insertions(+), 28 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index f5cf3dad75d9c2..a2bfa721f2515b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -327,8 +327,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor let hasVerifier = 1; } -def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>, - AllElementTypesMatch<["value", "TensorDesc"]>]> { +def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; let description = [{ diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 9d3c4366a7bd50..15c435f1fa257b 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -73,6 +73,39 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) { kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH; } +// Validations for nd instruction arguments is successful if any of these are +// true: +// - tensor descriptor and the output vector shapes exactly match. +// - tensor descriptor has a sg_map attribute and the distributed vector shape +// matches the tensor descriptor shape when scaled using sg_map factors on +// each dimension. 
+static bool isArgShapesValid(ArrayRef descShape, + ArrayRef valShape, SGMapAttr sgMap) { + if (descShape == valShape) { + if (!sgMap) + return true; + + // this can be relaxed if necessary by supporting non-2d shapes distribution + // until the constraints are defined this lives here instead of the tensor + // descriptor type. + return valShape.size() == sgMap.getWiLayout().size(); + } + + if (!sgMap) + return false; + + if (valShape.size() != descShape.size()) + return false; + + for (const auto &[factor, dim, expected] : + llvm::zip_equal(sgMap.getWiLayout(), valShape, descShape)) { + if (factor * dim != expected) + return false; + } + + return true; +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -210,13 +243,13 @@ LogicalResult PrefetchNdOp::verify() { return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isReadHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isReadHintOrNone(getL3HintAttr())) - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); return success(); } @@ -238,13 +271,13 @@ LogicalResult LoadNdOp::verify() { return emitOpError("Invalid result, it should be a VectorType.\n"); if (!isReadHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isReadHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isReadHintOrNone(getL3HintAttr())) - return 
emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); auto array_len = tdescTy.getArrayLength(); auto tdescShape = getShapeOf(tdescTy); @@ -280,8 +313,9 @@ LogicalResult LoadNdOp::verify() { auto it = tdescShape.begin(); tdescShape.insert(it, array_len); } + auto sgMap = tdescTy.getSGMapAttr(); - if (tdescShape != valueShape) + if (!isArgShapesValid(tdescShape, valueShape, sgMap)) return emitOpError() << "Result shape doesn't match TensorDesc shape." << "The expected shape is " << makeString(tdescShape) << ". But the given shape is " @@ -303,17 +337,26 @@ LogicalResult StoreNdOp::verify() { return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valTy) - return emitOpError("Exepcting a VectorType result.\n"); + return emitOpError("Expecting a VectorType result.\n"); if (!isWriteHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isWriteHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isWriteHintOrNone(getL3HintAttr())) - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + auto tdescShape = getShapeOf(dstTy); + auto valueShape = getShapeOf(valTy); + auto sgMap = dstTy.getSGMapAttr(); + if (!isArgShapesValid(tdescShape, valueShape, sgMap)) + return emitOpError() << "Result shape doesn't match TensorDesc shape." + << "The expected shape is " << makeString(tdescShape) + << ". 
But the given shape is " + << makeString(valueShape) << ".\n"; return success(); } @@ -423,13 +466,13 @@ LogicalResult PrefetchOp::verify() { return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isReadHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isReadHintOrNone(getL3HintAttr())) - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); return success(); } @@ -446,13 +489,13 @@ LogicalResult LoadGatherOp::verify() { return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isReadHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isReadHintOrNone(getL3HintAttr())) - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); auto tdescElemTy = tdescTy.getElementType(); auto valueElemTy = getElementType(); @@ -490,13 +533,13 @@ LogicalResult StoreScatterOp::verify() { return emitOpError("Expects a scattered TensorDesc.\n"); if (!isWriteHintOrNone(getL1HintAttr())) - return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); if (!isWriteHintOrNone(getL2HintAttr())) - return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); if (!isWriteHintOrNone(getL3HintAttr())) - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return emitOpError("invalid l3_hint: ") << 
getL3HintAttr(); auto maskTy = getMaskType(); auto valueTy = getValueType(); diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index a4587faa3345cb..d7174a489888a4 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -86,6 +86,17 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { gpu.return } +// load_nd args may have different shapes, validated against sg_map +// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> + gpu.return +} + // CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> @@ -108,6 +119,19 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { gpu.return } +// store_nd args may have different shapes, validated against sg_map +// CHECK: func @test_store_nd_vc_3(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @test_store_nd_vc_3(%src: memref<24x32xf16>) { + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x2xf16> + %1 = arith.constant dense<1.0>: vector<24x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map> + %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> + !xegpu.tensor_desc<24x32xf16, 
#xegpu.sg_map> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map> + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map> + gpu.return +} + // CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index f8a0d95bd70a27..7816bff0582f81 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -32,7 +32,7 @@ func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { // ----- func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> return } @@ -51,7 +51,7 @@ func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { // ----- func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return @@ -77,11 +77,29 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { return } +// ----- +func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + 
!xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x2xf32> + return +} + +// ----- +func.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + %2 = xegpu.load_nd %1: !xegpu.tensor_desc<16xf32, #xegpu.sg_map> -> vector<16xf32> + return +} + // ----- func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> return } @@ -147,7 +165,7 @@ func.func @test_prefetch_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> return } @@ -168,7 +186,7 @@ func.func @test_load_gather_vc_2(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, 
#xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x2xf32> @@ -193,7 +211,7 @@ func.func @test_store_scatter_vc_2(%src: ui64) { %1 = arith.constant dense<2.9>: vector<4x2xf32> %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> return From 13d09dfad6d1f6a15721688822ce33b74b44a8d8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Jan 2025 09:07:58 -0800 Subject: [PATCH 003/208] [X86] Simplify ArrayRef construction. NFC (#123899) I think the std::begin/end were to work around an old gcc bug. Hopefully we don't need them anymore. --- llvm/lib/Target/X86/X86CallingConv.cpp | 8 ++++---- llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index b85d9d9a7e535b..7359ef341dde58 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -68,23 +68,23 @@ static ArrayRef CC_X86_VectorCallGetSSEs(const MVT &ValVT) { if (ValVT.is512BitVector()) { static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2, X86::ZMM3, X86::ZMM4, X86::ZMM5}; - return ArrayRef(std::begin(RegListZMM), std::end(RegListZMM)); + return RegListZMM; } if (ValVT.is256BitVector()) { static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2, X86::YMM3, X86::YMM4, X86::YMM5}; - return ArrayRef(std::begin(RegListYMM), std::end(RegListYMM)); + return RegListYMM; } static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5}; - return ArrayRef(std::begin(RegListXMM), std::end(RegListXMM)); + return RegListXMM; } static ArrayRef 
CC_X86_64_VectorCallGetGPRs() { static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9}; - return ArrayRef(std::begin(RegListGPR), std::end(RegListGPR)); + return RegListGPR; } static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT, diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 10aa2a5e5dac8a..4a4fd246cb7cdf 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -1416,13 +1416,13 @@ static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + return GPR64ArgRegsWin64; } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); + return GPR64ArgRegs64Bit; } // FIXME: Get this from tablegen. @@ -1448,7 +1448,7 @@ static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); + return XMMArgRegs64Bit; } #ifndef NDEBUG From 9e6494c0fb29dfb5d4d2b7bf3ed7af261efee034 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Jan 2025 09:11:22 -0800 Subject: [PATCH 004/208] [CodeGen] Rename RegisterMaskPair to VRegMaskOrUnit. NFC (#123799) This holds a physical register unit or virtual register and mask. While I was here I've used emplace_back and removed an unneeded use of a template. 
--- llvm/include/llvm/CodeGen/MachineScheduler.h | 2 +- llvm/include/llvm/CodeGen/RegisterPressure.h | 41 ++++--- llvm/lib/CodeGen/MachinePipeliner.cpp | 8 +- llvm/lib/CodeGen/MachineScheduler.cpp | 11 +- llvm/lib/CodeGen/RegisterPressure.cpp | 116 +++++++++---------- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 20 ++-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 2 +- 7 files changed, 97 insertions(+), 103 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 3dd62b2ba333c3..660670ccdcd75b 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -525,7 +525,7 @@ class ScheduleDAGMILive : public ScheduleDAGMI { void initRegPressure(); - void updatePressureDiffs(ArrayRef LiveUses); + void updatePressureDiffs(ArrayRef LiveUses); void updateScheduledPressure(const SUnit *SU, const std::vector &NewMaxPressure); diff --git a/llvm/include/llvm/CodeGen/RegisterPressure.h b/llvm/include/llvm/CodeGen/RegisterPressure.h index 8a46e505affd2f..407afee343ce2b 100644 --- a/llvm/include/llvm/CodeGen/RegisterPressure.h +++ b/llvm/include/llvm/CodeGen/RegisterPressure.h @@ -35,11 +35,11 @@ class MachineInstr; class MachineRegisterInfo; class RegisterClassInfo; -struct RegisterMaskPair { +struct VRegMaskOrUnit { Register RegUnit; ///< Virtual register or register unit. LaneBitmask LaneMask; - RegisterMaskPair(Register RegUnit, LaneBitmask LaneMask) + VRegMaskOrUnit(Register RegUnit, LaneBitmask LaneMask) : RegUnit(RegUnit), LaneMask(LaneMask) {} }; @@ -49,8 +49,8 @@ struct RegisterPressure { std::vector MaxSetPressure; /// List of live in virtual registers or physical register units. 
- SmallVector LiveInRegs; - SmallVector LiveOutRegs; + SmallVector LiveInRegs; + SmallVector LiveOutRegs; void dump(const TargetRegisterInfo *TRI) const; }; @@ -166,13 +166,13 @@ class PressureDiff { class RegisterOperands { public: /// List of virtual registers and register units read by the instruction. - SmallVector Uses; + SmallVector Uses; /// List of virtual registers and register units defined by the /// instruction which are not dead. - SmallVector Defs; + SmallVector Defs; /// List of virtual registers and register units defined by the /// instruction but dead. - SmallVector DeadDefs; + SmallVector DeadDefs; /// Analyze the given instruction \p MI and fill in the Uses, Defs and /// DeadDefs list based on the MachineOperand flags. @@ -185,7 +185,7 @@ class RegisterOperands { void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS); /// Use liveness information to find out which uses/defs are partially - /// undefined/dead and adjust the RegisterMaskPairs accordingly. + /// undefined/dead and adjust the VRegMaskOrUnits accordingly. /// If \p AddFlagsMI is given then missing read-undef and dead flags will be /// added to the instruction. void adjustLaneLiveness(const LiveIntervals &LIS, @@ -303,7 +303,7 @@ class LiveRegSet { /// Mark the \p Pair.LaneMask lanes of \p Pair.Reg as live. /// Returns the previously live lanes of \p Pair.Reg. - LaneBitmask insert(RegisterMaskPair Pair) { + LaneBitmask insert(VRegMaskOrUnit Pair) { unsigned SparseIndex = getSparseIndexFromReg(Pair.RegUnit); auto InsertRes = Regs.insert(IndexMaskPair(SparseIndex, Pair.LaneMask)); if (!InsertRes.second) { @@ -316,7 +316,7 @@ class LiveRegSet { /// Clears the \p Pair.LaneMask lanes of \p Pair.Reg (mark them as dead). /// Returns the previously live lanes of \p Pair.Reg. 
- LaneBitmask erase(RegisterMaskPair Pair) { + LaneBitmask erase(VRegMaskOrUnit Pair) { unsigned SparseIndex = getSparseIndexFromReg(Pair.RegUnit); RegSet::iterator I = Regs.find(SparseIndex); if (I == Regs.end()) @@ -330,12 +330,11 @@ class LiveRegSet { return Regs.size(); } - template - void appendTo(ContainerT &To) const { + void appendTo(SmallVectorImpl &To) const { for (const IndexMaskPair &P : Regs) { Register Reg = getRegFromSparseIndex(P.Index); if (P.LaneMask.any()) - To.push_back(RegisterMaskPair(Reg, P.LaneMask)); + To.emplace_back(Reg, P.LaneMask); } } }; @@ -409,7 +408,7 @@ class RegPressureTracker { /// Force liveness of virtual registers or physical register /// units. Particularly useful to initialize the livein/out state of the /// tracker before the first call to advance/recede. - void addLiveRegs(ArrayRef Regs); + void addLiveRegs(ArrayRef Regs); /// Get the MI position corresponding to this register pressure. MachineBasicBlock::const_iterator getPos() const { return CurrPos; } @@ -421,14 +420,14 @@ class RegPressureTracker { void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; } /// Recede across the previous instruction. - void recede(SmallVectorImpl *LiveUses = nullptr); + void recede(SmallVectorImpl *LiveUses = nullptr); /// Recede across the previous instruction. /// This "low-level" variant assumes that recedeSkipDebugValues() was /// called previously and takes precomputed RegisterOperands for the /// instruction. void recede(const RegisterOperands &RegOpers, - SmallVectorImpl *LiveUses = nullptr); + SmallVectorImpl *LiveUses = nullptr); /// Recede until we find an instruction which is not a DebugValue. void recedeSkipDebugValues(); @@ -546,21 +545,21 @@ class RegPressureTracker { protected: /// Add Reg to the live out set and increase max pressure. - void discoverLiveOut(RegisterMaskPair Pair); + void discoverLiveOut(VRegMaskOrUnit Pair); /// Add Reg to the live in set and increase max pressure. 
- void discoverLiveIn(RegisterMaskPair Pair); + void discoverLiveIn(VRegMaskOrUnit Pair); /// Get the SlotIndex for the first nondebug instruction including or /// after the current position. SlotIndex getCurrSlot() const; - void bumpDeadDefs(ArrayRef DeadDefs); + void bumpDeadDefs(ArrayRef DeadDefs); void bumpUpwardPressure(const MachineInstr *MI); void bumpDownwardPressure(const MachineInstr *MI); - void discoverLiveInOrOut(RegisterMaskPair Pair, - SmallVectorImpl &LiveInOrOut); + void discoverLiveInOrOut(VRegMaskOrUnit Pair, + SmallVectorImpl &LiveInOrOut); LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; LaneBitmask getLiveLanesAt(Register RegUnit, SlotIndex Pos) const; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 54d9c1cf08e35b..0d5dc961590036 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1981,7 +1981,7 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallVector LiveOutRegs; + SmallVector LiveOutRegs; SmallSet Uses; for (SUnit *SU : NS) { const MachineInstr *MI = SU->getInstr(); @@ -2002,13 +2002,11 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, Register Reg = MO.getReg(); if (Reg.isVirtual()) { if (!Uses.count(Reg)) - LiveOutRegs.push_back(RegisterMaskPair(Reg, - LaneBitmask::getNone())); + LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone()); } else if (MRI.isAllocatable(Reg)) { for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) if (!Uses.count(Unit)) - LiveOutRegs.push_back( - RegisterMaskPair(Unit, LaneBitmask::getNone())); + LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone()); } } RPTracker.addLiveRegs(LiveOutRegs); diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 
91aaeea156c4a1..393530f56cc27e 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1288,7 +1288,7 @@ void ScheduleDAGMILive::initRegPressure() { // Account for liveness generated by the region boundary. if (LiveRegionEnd != RegionEnd) { - SmallVector LiveUses; + SmallVector LiveUses; BotRPTracker.recede(&LiveUses); updatePressureDiffs(LiveUses); } @@ -1352,9 +1352,8 @@ updateScheduledPressure(const SUnit *SU, /// Update the PressureDiff array for liveness after scheduling this /// instruction. -void ScheduleDAGMILive::updatePressureDiffs( - ArrayRef LiveUses) { - for (const RegisterMaskPair &P : LiveUses) { +void ScheduleDAGMILive::updatePressureDiffs(ArrayRef LiveUses) { + for (const VRegMaskOrUnit &P : LiveUses) { Register Reg = P.RegUnit; /// FIXME: Currently assuming single-use physregs. if (!Reg.isVirtual()) @@ -1579,7 +1578,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { unsigned MaxCyclicLatency = 0; // Visit each live out vreg def to find def/use pairs that cross iterations. 
- for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) { + for (const VRegMaskOrUnit &P : RPTracker.getPressure().LiveOutRegs) { Register Reg = P.RegUnit; if (!Reg.isVirtual()) continue; @@ -1707,7 +1706,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { if (BotRPTracker.getPos() != CurrentBottom) BotRPTracker.recedeSkipDebugValues(); - SmallVector LiveUses; + SmallVector LiveUses; BotRPTracker.recede(RegOpers, &LiveUses); assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure( diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index 037986ec48afbc..e8e6db1e3b3bd1 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -95,7 +95,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); dbgs() << "Live In: "; - for (const RegisterMaskPair &P : LiveInRegs) { + for (const VRegMaskOrUnit &P : LiveInRegs) { dbgs() << printVRegOrUnit(P.RegUnit, TRI); if (!P.LaneMask.all()) dbgs() << ':' << PrintLaneMask(P.LaneMask); @@ -103,7 +103,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { } dbgs() << '\n'; dbgs() << "Live Out: "; - for (const RegisterMaskPair &P : LiveOutRegs) { + for (const VRegMaskOrUnit &P : LiveOutRegs) { dbgs() << printVRegOrUnit(P.RegUnit, TRI); if (!P.LaneMask.all()) dbgs() << ':' << PrintLaneMask(P.LaneMask); @@ -358,7 +358,7 @@ void RegPressureTracker::closeRegion() { void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0); assert(isBottomClosed() && "need bottom-up tracking to intialize."); - for (const RegisterMaskPair &Pair : P.LiveOutRegs) { + for (const VRegMaskOrUnit &Pair : P.LiveOutRegs) { Register RegUnit = Pair.RegUnit; if (RegUnit.isVirtual() && !RPTracker.hasUntiedDef(RegUnit)) 
increaseSetPressure(LiveThruPressure, *MRI, RegUnit, @@ -366,9 +366,9 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { } } -static LaneBitmask getRegLanes(ArrayRef RegUnits, +static LaneBitmask getRegLanes(ArrayRef RegUnits, Register RegUnit) { - auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) @@ -376,11 +376,11 @@ static LaneBitmask getRegLanes(ArrayRef RegUnits, return I->LaneMask; } -static void addRegLanes(SmallVectorImpl &RegUnits, - RegisterMaskPair Pair) { +static void addRegLanes(SmallVectorImpl &RegUnits, + VRegMaskOrUnit Pair) { Register RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { @@ -390,23 +390,23 @@ static void addRegLanes(SmallVectorImpl &RegUnits, } } -static void setRegZero(SmallVectorImpl &RegUnits, +static void setRegZero(SmallVectorImpl &RegUnits, Register RegUnit) { - auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { - RegUnits.push_back(RegisterMaskPair(RegUnit, LaneBitmask::getNone())); + RegUnits.emplace_back(RegUnit, LaneBitmask::getNone()); } else { I->LaneMask = LaneBitmask::getNone(); } } -static void removeRegLanes(SmallVectorImpl &RegUnits, - RegisterMaskPair Pair) { +static void removeRegLanes(SmallVectorImpl &RegUnits, + VRegMaskOrUnit Pair) { Register RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) { return 
Other.RegUnit == RegUnit; }); if (I != RegUnits.end()) { @@ -480,7 +480,7 @@ class RegisterOperandsCollector { collectOperand(*OperI); // Remove redundant physreg dead defs. - for (const RegisterMaskPair &P : RegOpers.Defs) + for (const VRegMaskOrUnit &P : RegOpers.Defs) removeRegLanes(RegOpers.DeadDefs, P); } @@ -489,7 +489,7 @@ class RegisterOperandsCollector { collectOperandLanes(*OperI); // Remove redundant physreg dead defs. - for (const RegisterMaskPair &P : RegOpers.Defs) + for (const VRegMaskOrUnit &P : RegOpers.Defs) removeRegLanes(RegOpers.DeadDefs, P); } @@ -515,13 +515,12 @@ class RegisterOperandsCollector { } } - void pushReg(Register Reg, - SmallVectorImpl &RegUnits) const { + void pushReg(Register Reg, SmallVectorImpl &RegUnits) const { if (Reg.isVirtual()) { - addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneBitmask::getAll())); + addRegLanes(RegUnits, VRegMaskOrUnit(Reg, LaneBitmask::getAll())); } else if (MRI.isAllocatable(Reg)) { for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) - addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll())); + addRegLanes(RegUnits, VRegMaskOrUnit(Unit, LaneBitmask::getAll())); } } @@ -548,15 +547,15 @@ class RegisterOperandsCollector { } void pushRegLanes(Register Reg, unsigned SubRegIdx, - SmallVectorImpl &RegUnits) const { + SmallVectorImpl &RegUnits) const { if (Reg.isVirtual()) { LaneBitmask LaneMask = SubRegIdx != 0 ? 
TRI.getSubRegIndexLaneMask(SubRegIdx) : MRI.getMaxLaneMaskForVReg(Reg); - addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneMask)); + addRegLanes(RegUnits, VRegMaskOrUnit(Reg, LaneMask)); } else if (MRI.isAllocatable(Reg)) { for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) - addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll())); + addRegLanes(RegUnits, VRegMaskOrUnit(Unit, LaneBitmask::getAll())); } } }; @@ -622,7 +621,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS, LaneMask = getLiveLanesAt(LIS, MRI, true, RegUnit, Pos.getBaseIndex()); if (AddFlagsMI != nullptr) { - for (const RegisterMaskPair &P : DeadDefs) { + for (const VRegMaskOrUnit &P : DeadDefs) { Register RegUnit = P.RegUnit; if (!RegUnit.isVirtual()) continue; @@ -651,10 +650,10 @@ void PressureDiffs::addInstruction(unsigned Idx, const MachineRegisterInfo &MRI) { PressureDiff &PDiff = (*this)[Idx]; assert(!PDiff.begin()->isValid() && "stale PDiff"); - for (const RegisterMaskPair &P : RegOpers.Defs) + for (const VRegMaskOrUnit &P : RegOpers.Defs) PDiff.addPressureChange(P.RegUnit, true, &MRI); - for (const RegisterMaskPair &P : RegOpers.Uses) + for (const VRegMaskOrUnit &P : RegOpers.Uses) PDiff.addPressureChange(P.RegUnit, false, &MRI); } @@ -694,20 +693,20 @@ void PressureDiff::addPressureChange(Register RegUnit, bool IsDec, } /// Force liveness of registers. 
-void RegPressureTracker::addLiveRegs(ArrayRef Regs) { - for (const RegisterMaskPair &P : Regs) { +void RegPressureTracker::addLiveRegs(ArrayRef Regs) { + for (const VRegMaskOrUnit &P : Regs) { LaneBitmask PrevMask = LiveRegs.insert(P); LaneBitmask NewMask = PrevMask | P.LaneMask; increaseRegPressure(P.RegUnit, PrevMask, NewMask); } } -void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair, - SmallVectorImpl &LiveInOrOut) { +void RegPressureTracker::discoverLiveInOrOut( + VRegMaskOrUnit Pair, SmallVectorImpl &LiveInOrOut) { assert(Pair.LaneMask.any()); Register RegUnit = Pair.RegUnit; - auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { + auto I = llvm::find_if(LiveInOrOut, [RegUnit](const VRegMaskOrUnit &Other) { return Other.RegUnit == RegUnit; }); LaneBitmask PrevMask; @@ -724,22 +723,22 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair, increaseSetPressure(P.MaxSetPressure, *MRI, RegUnit, PrevMask, NewMask); } -void RegPressureTracker::discoverLiveIn(RegisterMaskPair Pair) { +void RegPressureTracker::discoverLiveIn(VRegMaskOrUnit Pair) { discoverLiveInOrOut(Pair, P.LiveInRegs); } -void RegPressureTracker::discoverLiveOut(RegisterMaskPair Pair) { +void RegPressureTracker::discoverLiveOut(VRegMaskOrUnit Pair) { discoverLiveInOrOut(Pair, P.LiveOutRegs); } -void RegPressureTracker::bumpDeadDefs(ArrayRef DeadDefs) { - for (const RegisterMaskPair &P : DeadDefs) { +void RegPressureTracker::bumpDeadDefs(ArrayRef DeadDefs) { + for (const VRegMaskOrUnit &P : DeadDefs) { Register Reg = P.RegUnit; LaneBitmask LiveMask = LiveRegs.contains(Reg); LaneBitmask BumpedMask = LiveMask | P.LaneMask; increaseRegPressure(Reg, LiveMask, BumpedMask); } - for (const RegisterMaskPair &P : DeadDefs) { + for (const VRegMaskOrUnit &P : DeadDefs) { Register Reg = P.RegUnit; LaneBitmask LiveMask = LiveRegs.contains(Reg); LaneBitmask BumpedMask = LiveMask | P.LaneMask; @@ -753,7 +752,7 @@ void RegPressureTracker::bumpDeadDefs(ArrayRef 
DeadDefs) { /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. void RegPressureTracker::recede(const RegisterOperands &RegOpers, - SmallVectorImpl *LiveUses) { + SmallVectorImpl *LiveUses) { assert(!CurrPos->isDebugOrPseudoInstr()); // Boost pressure for all dead defs together. @@ -761,7 +760,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, // Kill liveness at live defs. // TODO: consider earlyclobbers? - for (const RegisterMaskPair &Def : RegOpers.Defs) { + for (const VRegMaskOrUnit &Def : RegOpers.Defs) { Register Reg = Def.RegUnit; LaneBitmask PreviousMask = LiveRegs.erase(Def); @@ -769,7 +768,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, LaneBitmask LiveOut = Def.LaneMask & ~PreviousMask; if (LiveOut.any()) { - discoverLiveOut(RegisterMaskPair(Reg, LiveOut)); + discoverLiveOut(VRegMaskOrUnit(Reg, LiveOut)); // Retroactively model effects on pressure of the live out lanes. increaseSetPressure(CurrSetPressure, *MRI, Reg, LaneBitmask::getNone(), LiveOut); @@ -791,7 +790,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot(); // Generate liveness for uses. 
- for (const RegisterMaskPair &Use : RegOpers.Uses) { + for (const VRegMaskOrUnit &Use : RegOpers.Uses) { Register Reg = Use.RegUnit; assert(Use.LaneMask.any()); LaneBitmask PreviousMask = LiveRegs.insert(Use); @@ -803,19 +802,18 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, if (PreviousMask.none()) { if (LiveUses != nullptr) { if (!TrackLaneMasks) { - addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask)); + addRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask)); } else { - auto I = - llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { - return Other.RegUnit == Reg; - }); + auto I = llvm::find_if(*LiveUses, [Reg](const VRegMaskOrUnit Other) { + return Other.RegUnit == Reg; + }); bool IsRedef = I != LiveUses->end(); if (IsRedef) { // ignore re-defs here... assert(I->LaneMask.none()); - removeRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask)); + removeRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask)); } else { - addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask)); + addRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask)); } } } @@ -824,14 +822,14 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, if (RequireIntervals) { LaneBitmask LiveOut = getLiveThroughAt(Reg, SlotIdx); if (LiveOut.any()) - discoverLiveOut(RegisterMaskPair(Reg, LiveOut)); + discoverLiveOut(VRegMaskOrUnit(Reg, LiveOut)); } } increaseRegPressure(Reg, PreviousMask, NewMask); } if (TrackUntiedDefs) { - for (const RegisterMaskPair &Def : RegOpers.Defs) { + for (const VRegMaskOrUnit &Def : RegOpers.Defs) { Register RegUnit = Def.RegUnit; if (RegUnit.isVirtual() && (LiveRegs.contains(RegUnit) & Def.LaneMask).none()) @@ -861,7 +859,7 @@ void RegPressureTracker::recedeSkipDebugValues() { static_cast(P).openTop(SlotIdx); } -void RegPressureTracker::recede(SmallVectorImpl *LiveUses) { +void RegPressureTracker::recede(SmallVectorImpl *LiveUses) { recedeSkipDebugValues(); if (CurrPos->isDebugInstr() || CurrPos->isPseudoProbe()) { // It's possible to only have 
debug_value and pseudo probe instructions and @@ -902,27 +900,27 @@ void RegPressureTracker::advance(const RegisterOperands &RegOpers) { static_cast(P).openBottom(CurrPos); } - for (const RegisterMaskPair &Use : RegOpers.Uses) { + for (const VRegMaskOrUnit &Use : RegOpers.Uses) { Register Reg = Use.RegUnit; LaneBitmask LiveMask = LiveRegs.contains(Reg); LaneBitmask LiveIn = Use.LaneMask & ~LiveMask; if (LiveIn.any()) { - discoverLiveIn(RegisterMaskPair(Reg, LiveIn)); + discoverLiveIn(VRegMaskOrUnit(Reg, LiveIn)); increaseRegPressure(Reg, LiveMask, LiveMask | LiveIn); - LiveRegs.insert(RegisterMaskPair(Reg, LiveIn)); + LiveRegs.insert(VRegMaskOrUnit(Reg, LiveIn)); } // Kill liveness at last uses. if (RequireIntervals) { LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); if (LastUseMask.any()) { - LiveRegs.erase(RegisterMaskPair(Reg, LastUseMask)); + LiveRegs.erase(VRegMaskOrUnit(Reg, LastUseMask)); decreaseRegPressure(Reg, LiveMask, LiveMask & ~LastUseMask); } } } // Generate liveness for defs. - for (const RegisterMaskPair &Def : RegOpers.Defs) { + for (const VRegMaskOrUnit &Def : RegOpers.Defs) { LaneBitmask PreviousMask = LiveRegs.insert(Def); LaneBitmask NewMask = PreviousMask | Def.LaneMask; increaseRegPressure(Def.RegUnit, PreviousMask, NewMask); @@ -1051,7 +1049,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { bumpDeadDefs(RegOpers.DeadDefs); // Kill liveness at live defs. - for (const RegisterMaskPair &P : RegOpers.Defs) { + for (const VRegMaskOrUnit &P : RegOpers.Defs) { Register Reg = P.RegUnit; LaneBitmask LiveAfter = LiveRegs.contains(Reg); LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); @@ -1063,7 +1061,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { decreaseRegPressure(Reg, LiveAfter, LiveAfter & LiveBefore); } // Generate liveness for uses. Also handle any uses which overlap with defs. 
- for (const RegisterMaskPair &P : RegOpers.Uses) { + for (const VRegMaskOrUnit &P : RegOpers.Uses) { Register Reg = P.RegUnit; LaneBitmask LiveAfter = LiveRegs.contains(Reg); LaneBitmask LiveBefore = LiveAfter | P.LaneMask; @@ -1288,7 +1286,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx); if (RequireIntervals) { - for (const RegisterMaskPair &Use : RegOpers.Uses) { + for (const VRegMaskOrUnit &Use : RegOpers.Uses) { Register Reg = Use.RegUnit; LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); if (LastUseMask.none()) @@ -1311,7 +1309,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { } // Generate liveness for defs. - for (const RegisterMaskPair &Def : RegOpers.Defs) { + for (const VRegMaskOrUnit &Def : RegOpers.Defs) { Register Reg = Def.RegUnit; LaneBitmask LiveMask = LiveRegs.contains(Reg); LaneBitmask NewMask = LiveMask | Def.LaneMask; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index d46c4cf23a221e..a438ad00bc41df 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -256,7 +256,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO, } static void -collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, +collectVirtualRegUses(SmallVectorImpl &VRegMaskOrUnits, const MachineInstr &MI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { @@ -268,12 +268,12 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, continue; Register Reg = MO.getReg(); - auto I = llvm::find_if(RegMaskPairs, [Reg](const RegisterMaskPair &RM) { + auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) { return RM.RegUnit == Reg; }); - auto &P = I == RegMaskPairs.end() - ? RegMaskPairs.emplace_back(Reg, LaneBitmask::getNone()) + auto &P = I == VRegMaskOrUnits.end() + ? 
VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone()) : *I; P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg()) @@ -281,7 +281,7 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, } SlotIndex InstrSI; - for (auto &P : RegMaskPairs) { + for (auto &P : VRegMaskOrUnits) { auto &LI = LIS.getInterval(P.RegUnit); if (!LI.hasSubRanges()) continue; @@ -477,9 +477,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(DefPressure, MaxPressure); // Make uses alive. - SmallVector RegUses; + SmallVector RegUses; collectVirtualRegUses(RegUses, MI, LIS, *MRI); - for (const RegisterMaskPair &U : RegUses) { + for (const VRegMaskOrUnit &U : RegUses) { LaneBitmask &LiveMask = LiveRegs[U.RegUnit]; LaneBitmask PrevMask = LiveMask; LiveMask |= U.LaneMask; @@ -665,7 +665,7 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); GCNRegPressure TempPressure = CurPressure; - for (const RegisterMaskPair &Use : RegOpers.Uses) { + for (const VRegMaskOrUnit &Use : RegOpers.Uses) { Register Reg = Use.RegUnit; if (!Reg.isVirtual()) continue; @@ -699,7 +699,7 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, } // Generate liveness for defs. 
- for (const RegisterMaskPair &Def : RegOpers.Defs) { + for (const VRegMaskOrUnit &Def : RegOpers.Defs) { Register Reg = Def.RegUnit; if (!Reg.isVirtual()) continue; @@ -908,4 +908,4 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { return false; #undef PFX -} \ No newline at end of file +} diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 06c3d9027db1b5..7554b9f578fcbb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -170,7 +170,7 @@ class GCNRPTracker { bool After); /// Mostly copy/paste from CodeGen/RegisterPressure.cpp - void bumpDeadDefs(ArrayRef DeadDefs); + void bumpDeadDefs(ArrayRef DeadDefs); LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; From f63e8ed16ef1fd2deb80cd88b5ca9d5b631b1c36 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Wed, 22 Jan 2025 18:09:31 +0100 Subject: [PATCH 005/208] =?UTF-8?q?Revert=20"[Modules]=20Delay=20deseriali?= =?UTF-8?q?zation=20of=20preferred=5Fname=20attribute=20at=20r=E2=80=A6=20?= =?UTF-8?q?(#122726)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c3ba6f378ef80d750e2278560c6f95a300114412. We are seeing performance regressions of up to 40% on some compilations with this patch, we will investigate and reland after fixing performance issues. 
--- clang/include/clang/AST/Attr.h | 14 +--- clang/include/clang/Basic/Attr.td | 11 --- clang/include/clang/Serialization/ASTReader.h | 19 ----- .../clang/Serialization/ASTRecordReader.h | 13 +-- clang/lib/Serialization/ASTReader.cpp | 5 -- clang/lib/Serialization/ASTReaderDecl.cpp | 79 +------------------ clang/lib/Serialization/ASTWriter.cpp | 16 ++-- clang/test/Modules/preferred_name.cppm | 12 +-- clang/utils/TableGen/ClangAttrEmitter.cpp | 4 - 9 files changed, 17 insertions(+), 156 deletions(-) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index bed532a84a1bde..3365ebe4d9012b 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,8 +60,6 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -82,11 +80,10 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) + attr::Kind AK, bool IsLateParsed) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false), - DeferDeserialization(DeferDeserialization) {} + InheritEvenIfAlreadyPresent(false) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -108,8 +105,6 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } - bool shouldDeferDeserialization() const { return DeferDeserialization; } - // Clone this attribute. 
Attr *clone(ASTContext &C) const; @@ -151,9 +146,8 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent, - bool DeferDeserialization = false) - : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { + bool InheritEvenIfAlreadyPresent) + : Attr(Context, CommonInfo, AK, IsLateParsed) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 3969dd8af5dfae..408d3adf370c85 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,12 +713,6 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; - // Set to true if deserialization of this attribute must be deferred until - // the parent Decl is fully deserialized (during header module file - // deserialization). E.g., this is the case for the preferred_name attribute, - // since its type deserialization depends on its target Decl type. - // (See https://github.com/llvm/llvm-project/issues/56490 for details). - bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3260,11 +3254,6 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; - // Type of this attribute depends on the target Decl type. - // Therefore, its deserialization must be deferred until - // deserialization of the target Decl is complete - // (for header modules). 
- let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 82564fe664acba..7530015c9dacf3 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -1236,24 +1236,6 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; - /// Deserialization of some attributes must be deferred since they refer - /// to themselves in their type (e.g., preferred_name attribute refers to the - /// typedef that refers back to the template specialization of the template - /// that the attribute is attached to). - /// More attributes that store TypeSourceInfo might be potentially affected, - /// see https://github.com/llvm/llvm-project/issues/56490 for details. - struct DeferredAttribute { - // Index of the deferred attribute in the Record of the TargetedDecl. - uint64_t RecordIdx; - // Decl to attach a deferred attribute to. - Decl *TargetedDecl; - }; - - /// The collection of Decls that have been loaded but some of their attributes - /// have been deferred, paired with the index inside the record pointing - /// at the skipped attribute. 
- SmallVector PendingDeferredAttributes; - template using DuplicateObjCDecls = std::pair; @@ -1606,7 +1588,6 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); - void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index a29972fcf73a8d..2561418b78ca7f 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,12 +83,6 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } - /// Returns the next N values in this record, without advancing. - uint64_t peekInts(unsigned N) { return Record[Idx + N]; } - - /// Skips the current value. - void skipInt() { Idx += 1; } - /// Skips the specified number of values. void skipInts(unsigned N) { Idx += N; } @@ -341,12 +335,7 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - /// For some attributes (where type depends on itself recursively), defer - /// reading the attribute until the type has been read. - void readAttributes(AttrVec &Attrs, Decl *D = nullptr); - - /// Reads one attribute from the current stream position, advancing Idx. - Attr *readOrDeferAttrFor(Decl *D); + void readAttributes(AttrVec &Attrs); /// Read an BTFTypeTagAttr object. 
BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index a72ff766685bbe..08801d22fdca86 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -10239,11 +10239,6 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); - // Load the delayed preferred name attributes. - for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) - loadDeferredAttribute(PendingDeferredAttributes[I]); - PendingDeferredAttributes.clear(); - // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index de834285fa76b2..72191395ec8067 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -613,7 +613,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs, D); + Record.readAttributes(Attrs); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. D->setAttrsImpl(Attrs, Reader.getContext()); @@ -3098,8 +3098,6 @@ class AttrReader { return Reader.readInt(); } - uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } - bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3130,29 +3128,18 @@ class AttrReader { return Reader.readVersionTuple(); } - void skipInt() { Reader.skipInts(1); } - - void skipInts(unsigned N) { Reader.skipInts(N); } - - unsigned getCurrentIdx() { return Reader.getIdx(); } - OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } -/// Reads one attribute from the current stream position, advancing Idx. 
Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; - // Read and ignore the skip count, since attribute deserialization is not - // deferred on this pass. - Record.skipInt(); - Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3182,28 +3169,13 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position, advancing Idx. -/// For some attributes (where type depends on itself recursively), defer -/// reading the attribute until the type has been read. -void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { +/// Reads attributes from the current stream position. +void ASTRecordReader::readAttributes(AttrVec &Attrs) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readOrDeferAttrFor(D)) + if (auto *A = readAttr()) Attrs.push_back(A); } -/// Reads one attribute from the current stream position, advancing Idx. -/// For some attributes (where type depends on itself recursively), defer -/// reading the attribute until the type has been read. 
-Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { - AttrReader Record(*this); - unsigned SkipCount = Record.peekInts(1); - if (!SkipCount) - return readAttr(); - Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); - Record.skipInts(SkipCount); - return nullptr; -} - //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -4512,49 +4484,6 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } -void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { - Decl *D = DA.TargetedDecl; - ModuleFile *M = getOwningModuleFile(D); - - unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); - const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; - RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); - - llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; - SavedStreamPosition SavedPosition(Cursor); - if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { - Error(std::move(Err)); - } - - Expected MaybeCode = Cursor.ReadCode(); - if (!MaybeCode) { - llvm::report_fatal_error( - Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + - toString(MaybeCode.takeError())); - } - unsigned Code = MaybeCode.get(); - - ASTRecordReader Record(*this, *Loc.F); - Expected MaybeRecCode = Record.readRecord(Cursor, Code); - if (!MaybeRecCode) { - llvm::report_fatal_error( - Twine( - "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + - toString(MaybeCode.takeError())); - } - unsigned RecCode = MaybeRecCode.get(); - if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { - llvm::report_fatal_error( - Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " - "expected valid DeclCode") + - toString(MaybeCode.takeError())); - } - - Record.skipInts(DA.RecordIdx); - Attr *A = 
Record.readAttr(); - getContext().getDeclAttrs(D).push_back(A); -} - namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 066c4b1533552a..a580f375aee354 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,7 +37,6 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" -#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -5156,14 +5155,15 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - if (!A) + // FIXME: Clang can't handle the serialization/deserialization of + // preferred_name properly now. See + // https://github.com/llvm/llvm-project/issues/56490 for example. + if (!A || (isa(A) && + Writer->isWritingStdCXXNamedModules())) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs - auto SkipIdx = Record.size(); - // Add placeholder for the size of deferred attribute. - Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5174,12 +5174,6 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" - - if (A->shouldDeferDeserialization()) { - // Record the actual size of deferred attribute (+ 1 to count the attribute - // kind). - Record[SkipIdx] = Record.size() - SkipIdx + 1; - } } /// Emit the list of attributes to the specified record. 
diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 86ba6ae96db998..806781a81c5ca7 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,16 +53,10 @@ import A; export using ::foo_templ; //--- Use1.cpp -// expected-no-diagnostics -import A; -#include "foo.h" +import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} +#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} + //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; - -//--- Use3.cpp -#include "foo.h" -import A; -foo test; -int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 41730eba32ce27..cc6a8eaebd44ec 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,10 +3043,6 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? 
"true" : "false"); } - if (R.getValueAsBit("DeferDeserialization")) { - OS << ", " - << "/*DeferDeserialization=*/true"; - } OS << ")\n"; for (auto const &ai : Args) { From 16298e4cf23d351fcd789fd027d8a30d9329fa81 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Jan 2025 16:54:14 +0000 Subject: [PATCH 006/208] [X86] var-permute-256.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/var-permute-256.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 2968c97a1355ce..7296cc27894c35 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -402,7 +402,7 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v16i16: @@ -545,7 +545,7 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm0, %ymm1 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) ; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 ; AVX512VLDQ-NEXT: retq ; @@ -675,7 +675,7 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq $202, 
%ymm3, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v32i8: @@ -857,7 +857,7 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm0, %ymm1 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) ; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 ; AVX512VLDQ-NEXT: retq ; @@ -1502,7 +1502,7 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16: @@ -1618,7 +1618,7 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8: From 603529b29eb5441fe7d32f8d154a0ed876038ef2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Jan 2025 16:56:31 +0000 Subject: [PATCH 007/208] [X86] add/sub signed sat vec tests - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 2 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index b2b242fa29818f..322acd76e12e63 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 64aead70415759..ac8b561abf0033 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) @@ -601,7 +601,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; ; AVX512BW-LABEL: v16i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem) ; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z From e6c7d6a56a850228ccb7b1659e383dd5a55e7bdb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 
Jan 2025 16:57:12 +0000 Subject: [PATCH 008/208] [X86] avx512-broadcast-unfold.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index d7ecbb41e3b149..ba2cacc087b36b 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -4402,7 +4402,7 @@ define void @bcast_unfold_vpternlog_v16i32(ptr %arg, ptr %arg1) { ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 ; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm2 ; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm3 -; CHECK-NEXT: vpternlogd $216, %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1)) ; CHECK-NEXT: vpmulld %zmm3, %zmm2, %zmm1 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax From bb754f2c98ddeeadf75f21e6fbc6bd03898f008c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Jan 2025 16:58:33 +0000 Subject: [PATCH 009/208] [X86] avx512 intrinsics tests - regenerate VPTERNLOG comments --- .../X86/avx512-intrinsics-fast-isel.ll | 36 ++++++++--------- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 20 +++++----- .../X86/avx512vl-intrinsics-fast-isel.ll | 40 +++++++++---------- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 1ca870add95b5d..a8574c0b7516c1 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -2218,7 +2218,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { ; CHECK-LABEL: test_mm512_ternarylogic_epi32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogd $4, 
%zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 & ~(zmm0 | zmm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = bitcast <8 x i64> %__A to <16 x i32> @@ -2236,13 +2236,13 @@ define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} = zmm1 & ~(zmm0 | zmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_mask_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} = zmm1 & ~(zmm0 | zmm2) ; X64-NEXT: retq entry: %0 = bitcast <8 x i64> %__A to <16 x i32> @@ -2260,13 +2260,13 @@ define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64 ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = zmm1 & ~(zmm0 | zmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = zmm1 & ~(zmm0 | zmm2) ; X64-NEXT: retq entry: %0 = bitcast <8 x i64> %__A to <16 x i32> @@ -2282,7 +2282,7 @@ entry: define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { ; CHECK-LABEL: test_mm512_ternarylogic_epi64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 & ~(zmm0 | zmm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) @@ -2296,13 +2296,13 @@ define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 
zeroext ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} = zmm1 & ~(zmm0 | zmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_mask_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} = zmm1 & ~(zmm0 | zmm2) ; X64-NEXT: retq entry: %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) @@ -2316,13 +2316,13 @@ define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = zmm1 & ~(zmm0 | zmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = zmm1 & ~(zmm0 | zmm2) ; X64-NEXT: retq entry: %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) @@ -6864,7 +6864,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -6880,7 +6880,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-LABEL: test_mm512_mask_reduce_and_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: 
vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -7200,7 +7200,7 @@ define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -7217,7 +7217,7 @@ define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-LABEL: test_mm512_mask_reduce_and_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -8176,7 +8176,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 @@ -8192,7 +8192,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-LABEL: test_mm512_mask_reduce_min_epu64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 @@ -8778,7 +8778,7 @@ define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0 @@ -8795,7 +8795,7 @@ define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-LABEL: test_mm512_mask_reduce_min_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index b77c753107a6e1..926af4e9957afb 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -5008,7 +5008,7 @@ declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = ~(zmm1 | (zmm0 ^ zmm2)) ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) ret <16 x i32> %1 @@ -5018,13 +5018,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} = ~(zmm1 | (zmm0 ^ zmm2)) ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} +; 
X86-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} = ~(zmm1 | (zmm0 ^ zmm2)) ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) %2 = bitcast i16 %x4 to <16 x i1> @@ -5036,13 +5036,13 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = ~(zmm1 | (zmm0 ^ zmm2)) ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = ~(zmm1 | (zmm0 ^ zmm2)) ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) %2 = bitcast i16 %x4 to <16 x i1> @@ -5055,7 +5055,7 @@ declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64 define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm1 | (zmm0 ^ zmm2)) ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) ret <8 x i64> %1 @@ -5065,14 +5065,14 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} = ~(zmm1 | (zmm0 ^ zmm2)) ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: 
kmovw %eax, %k1 -; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} = ~(zmm1 | (zmm0 ^ zmm2)) ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -5084,14 +5084,14 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = ~(zmm1 | (zmm0 ^ zmm2)) ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = ~(zmm1 | (zmm0 ^ zmm2)) ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 87799c1e82fed3..1a60644b2fc228 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -3457,7 +3457,7 @@ entry: define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { ; CHECK-LABEL: test_mm_ternarylogic_epi32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & ~(xmm0 | xmm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = bitcast <2 x i64> %__A to <4 x i32> @@ -3475,13 +3475,13 @@ define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__ ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, 
%xmm0 {%k1} +; X86-NEXT: vpternlogd {{.*#+}} xmm0 {%k1} = xmm1 & ~(xmm0 | xmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} +; X64-NEXT: vpternlogd {{.*#+}} xmm0 {%k1} = xmm1 & ~(xmm0 | xmm2) ; X64-NEXT: retq entry: %0 = bitcast <2 x i64> %__A to <4 x i32> @@ -3500,13 +3500,13 @@ define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %_ ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X86-NEXT: vpternlogd {{.*#+}} xmm0 {%k1} {z} = xmm1 & ~(xmm0 | xmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskz_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vpternlogd {{.*#+}} xmm0 {%k1} {z} = xmm1 & ~(xmm0 | xmm2) ; X64-NEXT: retq entry: %0 = bitcast <2 x i64> %__A to <4 x i32> @@ -3523,7 +3523,7 @@ entry: define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { ; CHECK-LABEL: test_mm256_ternarylogic_epi32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 & ~(ymm0 | ymm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = bitcast <4 x i64> %__A to <8 x i32> @@ -3541,13 +3541,13 @@ define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} +; X86-NEXT: vpternlogd {{.*#+}} ymm0 {%k1} = ymm1 & ~(ymm0 | ymm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} +; X64-NEXT: vpternlogd {{.*#+}} ymm0 {%k1} = ymm1 & ~(ymm0 | ymm2) ; X64-NEXT: retq 
entry: %0 = bitcast <4 x i64> %__A to <8 x i32> @@ -3565,13 +3565,13 @@ define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} +; X86-NEXT: vpternlogd {{.*#+}} ymm0 {%k1} {z} = ymm1 & ~(ymm0 | ymm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_ternarylogic_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: vpternlogd {{.*#+}} ymm0 {%k1} {z} = ymm1 & ~(ymm0 | ymm2) ; X64-NEXT: retq entry: %0 = bitcast <4 x i64> %__A to <8 x i32> @@ -3587,7 +3587,7 @@ entry: define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { ; CHECK-LABEL: test_mm_ternarylogic_epi64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 & ~(xmm0 | xmm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) @@ -3601,13 +3601,13 @@ define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__ ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} +; X86-NEXT: vpternlogq {{.*#+}} xmm0 {%k1} = xmm1 & ~(xmm0 | xmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} +; X64-NEXT: vpternlogq {{.*#+}} xmm0 {%k1} = xmm1 & ~(xmm0 | xmm2) ; X64-NEXT: retq entry: %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) @@ -3622,13 +3622,13 @@ define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %_ ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X86-NEXT: vpternlogq {{.*#+}} xmm0 {%k1} {z} = xmm1 & ~(xmm0 | xmm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskz_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vpternlogq {{.*#+}} xmm0 {%k1} {z} = xmm1 & ~(xmm0 | xmm2) ; X64-NEXT: retq entry: %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) @@ -3641,7 +3641,7 @@ entry: define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { ; CHECK-LABEL: test_mm256_ternarylogic_epi64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 & ~(ymm0 | ymm2) ; CHECK-NEXT: ret{{[l|q]}} entry: %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) @@ -3655,13 +3655,13 @@ define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} +; X86-NEXT: vpternlogq {{.*#+}} ymm0 {%k1} = ymm1 & ~(ymm0 | ymm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} +; X64-NEXT: vpternlogq {{.*#+}} ymm0 {%k1} = ymm1 & ~(ymm0 | ymm2) ; X64-NEXT: retq entry: %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) @@ -3676,13 +3676,13 @@ define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} +; X86-NEXT: 
vpternlogq {{.*#+}} ymm0 {%k1} {z} = ymm1 & ~(ymm0 | ymm2) ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_ternarylogic_epi64: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: vpternlogq {{.*#+}} ymm0 {%k1} {z} = ymm1 & ~(ymm0 | ymm2) ; X64-NEXT: retq entry: %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) From a25f2cb3e6953691fade076c8e0ccebf1016d3d9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Jan 2025 17:02:24 +0000 Subject: [PATCH 010/208] [X86] vector rotate tests - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/vector-rotate-128.ll | 12 ++--- llvm/test/CodeGen/X86/vector-rotate-256.ll | 22 ++++----- llvm/test/CodeGen/X86/vector-rotate-512.ll | 52 +++++++++++----------- 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index f9b903406e30fd..b114cba14cb6c7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1581,7 +1581,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1590,7 +1590,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1)) ; 
AVX512VLX-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v16i8: @@ -1739,7 +1739,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16: @@ -1754,7 +1754,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1) ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16: @@ -1819,7 +1819,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1828,7 +1828,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1)) ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll 
b/llvm/test/CodeGen/X86/vector-rotate-256.ll index e54d235973c79b..86c4d79a28c891 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -391,12 +391,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -411,17 +411,17 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: 
vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem) ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1402,7 +1402,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq ; @@ -1410,7 +1410,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512VLX-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v32i8: @@ -1576,7 +1576,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16: @@ -1591,7 +1591,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1) ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16: @@ -1665,7 +1665,7 @@ define <32 x i8> 
@splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512NOVLX-NEXT: retq ; @@ -1673,7 +1673,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 29afbf4c62ef5a..8ac0b178a16dfe 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -136,14 +136,14 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = 
[4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4)) ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -155,12 +155,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -178,35 +178,35 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3)) ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = 
[4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4)) ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8) ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3)) ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3)) ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm3, %ymm4 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm3 & ymm8) ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -754,7 +754,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_v64i8: @@ -766,35 +766,35 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; 
AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, %lshr = lshr <64 x i8> %a, @@ -844,7 +844,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: @@ -856,21 +856,21 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1) ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1) ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16: @@ -902,7 +902,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; 
AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -915,7 +915,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -923,7 +923,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -931,7 +931,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; @@ -939,7 +939,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VBMI2-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -947,7 +947,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, From 44f316811016e677ca3e6c6237619e71bae28986 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Jan 2025 17:03:59 +0000 Subject: [PATCH 011/208] [X86] vector reduction tests - regenerate VPTERNLOG comments --- .../CodeGen/X86/vector-reduce-and-bool.ll | 8 ++--- .../test/CodeGen/X86/vector-reduce-and-cmp.ll | 16 +++++----- .../CodeGen/X86/vector-reduce-and-scalar.ll | 6 ++-- llvm/test/CodeGen/X86/vector-reduce-umax.ll | 32 +++++++++---------- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index 49cd4d20d166a7..f434fc8c6cad80 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -1622,7 +1622,7 @@ define i1 @icmp1_v8i64_v8i1(<8 x i64>) nounwind { ; ; AVX512-LABEL: icmp1_v8i64_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -1695,7 +1695,7 @@ define i1 @icmp1_v16i32_v16i1(<16 x i32>) nounwind { ; ; AVX512-LABEL: icmp1_v16i32_v16i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; 
AVX512-NEXT: sete %al @@ -1768,7 +1768,7 @@ define i1 @icmp1_v32i16_v32i1(<32 x i16>) nounwind { ; ; AVX512-LABEL: icmp1_v32i16_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -1841,7 +1841,7 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) nounwind { ; ; AVX512-LABEL: icmp1_v64i8_v64i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll index d4d79ddbd589f9..57ab56b6494b50 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -131,7 +131,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -199,7 +199,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al @@ -353,7 +353,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al @@ -421,7 +421,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX512-LABEL: test_v32i32: 
; AVX512: # %bb.0: ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -594,7 +594,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -662,7 +662,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al @@ -854,7 +854,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al @@ -922,7 +922,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll index 3a3824260140f2..5317f7ccc588b5 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll @@ -154,7 +154,7 @@ define i1 @test_v8i64(ptr %ptr) nounwind { ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, 
%zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1 ; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -250,7 +250,7 @@ define i1 @test_v16i64(ptr %ptr) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vpandq 64(%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al @@ -471,7 +471,7 @@ define i1 @test_v16i32(ptr %ptr) nounwind { ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1 ; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 3b25a6e033f2fd..b355c3dee53098 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -1415,7 +1415,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX512BW-LABEL: test_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: notl %eax @@ -1425,7 +1425,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX512VL-LABEL: test_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: notl %eax @@ -1495,7 +1495,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: 
vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: notl %eax @@ -1507,7 +1507,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: notl %eax @@ -1590,7 +1590,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: notl %eax @@ -1604,7 +1604,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: notl %eax @@ -1708,7 +1708,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: notl %eax @@ -1723,7 +1723,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuw %xmm1, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: notl %eax @@ -1890,7 +1890,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX512BW-LABEL: test_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 @@ -1902,7 +1902,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512VL-LABEL: test_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 @@ -1980,7 +1980,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 @@ -1994,7 +1994,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 @@ -2083,7 +2083,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; 
AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 @@ -2099,7 +2099,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 @@ -2205,7 +2205,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 @@ -2222,7 +2222,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 From 2476417232cdf2e1fce1a1df466b0995cdf559c5 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 22 Jan 2025 09:32:12 -0800 Subject: [PATCH 012/208] Reapply "[sanitizer][NFCI] Add Options parameter to LowerAllowCheckPass" (#122833) (#122994) This reverts commit 1515caf7a59dc20cb932b724b2ef5c1d1a593427 (https://github.com/llvm/llvm-project/pull/122833) i.e., relands 7d8b4eb0ead277f41ff69525ed807f9f6e227f37 (https://github.com/llvm/llvm-project/pull/122765), with 
LowerAllowCheckPass::Options moved inside the callback to fix a stack use-after-scope error. --------- Co-authored-by: Vitaly Buka --- clang/lib/CodeGen/BackendUtil.cpp | 9 +++++---- .../Instrumentation/LowerAllowCheckPass.h | 9 +++++++++ llvm/lib/Passes/PassBuilder.cpp | 15 +++++++++++++++ llvm/lib/Passes/PassRegistry.def | 5 ++++- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3951ad01497cca..f60f8672e6a0b8 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -796,10 +796,11 @@ static void addSanitizers(const Triple &TargetTriple, if (LowerAllowCheckPass::IsRequested()) { // We want to call it after inline, which is about OptimizerEarlyEPCallback. - PB.registerOptimizerEarlyEPCallback([](ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { - MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass())); + PB.registerOptimizerEarlyEPCallback([&](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + LowerAllowCheckPass::Options Opts; + MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass(Opts))); }); } } diff --git a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h index af974818fec5f3..3ee907606e12b8 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h +++ b/llvm/include/llvm/Transforms/Instrumentation/LowerAllowCheckPass.h @@ -24,9 +24,18 @@ namespace llvm { // from the hot code. 
class LowerAllowCheckPass : public PassInfoMixin { public: + struct Options { + std::vector placeholder; // TODO: cutoffs + }; + + explicit LowerAllowCheckPass(LowerAllowCheckPass::Options Opts) + : Opts(std::move(Opts)) {}; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool IsRequested(); + +private: + LowerAllowCheckPass::Options Opts; }; } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f698a3df08ef78..1e97cef22045d4 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -821,6 +821,21 @@ Expected parseEmbedBitcodePassOptions(StringRef Params) { return Result; } +Expected +parseLowerAllowCheckPassOptions(StringRef Params) { + LowerAllowCheckPass::Options Result; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + return make_error( + formatv("invalid LowerAllowCheck pass parameter '{0}' ", ParamName) + .str(), + inconvertibleErrorCode()); + } + return Result; +} + Expected parseMSanPassOptions(StringRef Params) { MemorySanitizerOptions Result; while (!Params.empty()) { diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index a93a995655a147..0eb050c8adb047 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -402,7 +402,6 @@ FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) -FUNCTION_PASS("lower-allow-check", LowerAllowCheckPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) @@ -553,6 +552,10 @@ FUNCTION_PASS_WITH_PARAMS( parseLoopVectorizeOptions, "no-interleave-forced-only;interleave-forced-only;no-vectorize-forced-only;" 
"vectorize-forced-only") +FUNCTION_PASS_WITH_PARAMS( + "lower-allow-check", "LowerAllowCheckPass", + [](LowerAllowCheckPass::Options Opts) { return LowerAllowCheckPass(Opts); }, + parseLowerAllowCheckPassOptions, "") FUNCTION_PASS_WITH_PARAMS( "lower-matrix-intrinsics", "LowerMatrixIntrinsicsPass", [](bool Minimal) { return LowerMatrixIntrinsicsPass(Minimal); }, From f3c3a9b8829760b730b6651e460f9035065dd4c5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 22 Jan 2025 09:33:08 -0800 Subject: [PATCH 013/208] [libc][cmake] error if user disables sanitizers but wants scudo (#123834) I found this out the hard way...though we don't suggest in our docs setting or unsetting COMPILER_RT_BUILD_SANITIZERS, I had this explicitly disabled in a cmake script I was using to setup an llvm-libc based sysroot. While the libc compiled, hello world failed to link due to missing references to malloc at link time. Though I had set the cmake variables to opt into using scudo, apparently explicitly disabling sanitizers will still prevent scudo from being built... Check for this at configure time and stop the build then. --- libc/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 6f1c180a3f192e..e5ac842edf56ec 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -323,6 +323,9 @@ if(LLVM_LIBC_INCLUDE_SCUDO) if (NOT ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS OR "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)) message(FATAL_ERROR "SCUDO cannot be included without adding compiler-rt to LLVM_ENABLE_PROJECTS or LLVM_ENABLE_RUNTIMES") endif() + if (DEFINED COMPILER_RT_BUILD_SANITIZERS AND NOT COMPILER_RT_BUILD_SANITIZERS) + message(FATAL_ERROR "Disabling COMPILER_RT_BUILD_SANITIZERS will produce a libc without malloc/free") + endif() endif() option(LIBC_INCLUDE_DOCS "Build the libc documentation." 
${LLVM_INCLUDE_DOCS}) From ddb8607fe8b0b74a6d89c79d4fcc158673ac765a Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 22 Jan 2025 09:33:53 -0800 Subject: [PATCH 014/208] [libc][docs] disable pthreads docs (#123824) Having a target named pthreads is breaking when multiple runtimes are enabled. Disable this target for now so that the builds go back to green (and sites get updated). Link: https://github.com/llvm/llvm-zorg/issues/359#issuecomment-2600285688 Link: #122006 Link: #122497 Link: #123821 --- libc/docs/CMakeLists.txt | 3 ++- libc/docs/headers/index.rst | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index f88d7c27f9f6b3..04eaa7f1b8a5dc 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -45,7 +45,8 @@ if (SPHINX_FOUND) locale net/if netinet/in - pthread + # TODO: https://github.com/llvm/llvm-project/issues/123821 + # pthread setjmp signal stdbit diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 858b2142defa92..4a66d68ed902d8 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -17,7 +17,6 @@ Implementation Status math/index.rst net/if netinet/in - pthread search setjmp signal @@ -37,3 +36,6 @@ Implementation Status uchar wchar wctype +.. + TODO: https://github.com/llvm/llvm-project/issues/123821 + pthread From 8e79ade49d68c49aeb8ba008b59f559b86d22765 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 22 Jan 2025 09:34:59 -0800 Subject: [PATCH 015/208] [libc][LIBC_ADD_NULL_CHECKS] replace volatile deref with __builtin_trap (#123401) Also, update the unit tests that were checking for SIGSEGV to not check for a specific signal. To further improve this check, it may be worth: - renaming the configuration option/macro/docs to be clearer about intent. 
- swap __builtin_trap for __builtin_unreachable, removing the preprocessor variants of LIBC_CRASH_ON_NULLPTR, then unconditionally using `-fsanitize=unreachable -fsanitize-trap=unreachable` in cmake flags when LIBC_ADD_NULL_CHECKS is enabled. - building with `-fno-delete-null-pointer-checks` when LIBC_ADD_NULL_CHECKS (or when some larger yet to be added hardening config) is enabled. Link: #111546 --- libc/src/__support/macros/null_check.h | 9 ++------- libc/test/src/math/smoke/nan_test.cpp | 4 ++-- libc/test/src/math/smoke/nanf128_test.cpp | 4 ++-- libc/test/src/math/smoke/nanf16_test.cpp | 4 ++-- libc/test/src/math/smoke/nanf_test.cpp | 4 ++-- libc/test/src/math/smoke/nanl_test.cpp | 4 ++-- 6 files changed, 12 insertions(+), 17 deletions(-) diff --git a/libc/src/__support/macros/null_check.h b/libc/src/__support/macros/null_check.h index 400f7d809db4fa..eda19f889235e4 100644 --- a/libc/src/__support/macros/null_check.h +++ b/libc/src/__support/macros/null_check.h @@ -14,15 +14,10 @@ #include "src/__support/macros/sanitizer.h" #if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) -// Use volatile to prevent undefined behavior of dereferencing nullptr. -// Intentionally crashing with SIGSEGV. 
-#define LIBC_CRASH_ON_NULLPTR(PTR) \ +#define LIBC_CRASH_ON_NULLPTR(ptr) \ do { \ - if (LIBC_UNLIKELY(PTR == nullptr)) { \ - volatile auto *crashing = PTR; \ - [[maybe_unused]] volatile auto crash = *crashing; \ + if (LIBC_UNLIKELY((ptr) == nullptr)) \ __builtin_trap(); \ - } \ } while (0) #else #define LIBC_CRASH_ON_NULLPTR(ptr) \ diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp index da6beb94c7f05d..e45e2e6d499a2b 100644 --- a/libc/test/src/math/smoke/nan_test.cpp +++ b/libc/test/src/math/smoke/nan_test.cpp @@ -44,8 +44,8 @@ TEST_F(LlvmLibcNanTest, RandomString) { run_test("123 ", 0x7ff8000000000000); } -#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) TEST_F(LlvmLibcNanTest, InvalidInput) { - EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV)); + EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }); } #endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp index dd1986f17b9785..aa59b79aac9d80 100644 --- a/libc/test/src/math/smoke/nanf128_test.cpp +++ b/libc/test/src/math/smoke/nanf128_test.cpp @@ -55,8 +55,8 @@ TEST_F(LlvmLibcNanf128Test, RandomString) { QUIET_NAN); } -#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) TEST_F(LlvmLibcNanf128Test, InvalidInput) { - EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV)); + EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }); } #endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp index 5fafb1a36e4cdc..04a8c7bb5d9338 100644 --- a/libc/test/src/math/smoke/nanf16_test.cpp +++ b/libc/test/src/math/smoke/nanf16_test.cpp @@ -43,8 +43,8 @@ TEST_F(LlvmLibcNanf16Test, RandomString) { 
run_test("123 ", 0x7e00); } -#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) TEST_F(LlvmLibcNanf16Test, InvalidInput) { - EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV)); + EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }); } #endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf_test.cpp b/libc/test/src/math/smoke/nanf_test.cpp index 19d94b40b5ffbd..40e90c48d8cda7 100644 --- a/libc/test/src/math/smoke/nanf_test.cpp +++ b/libc/test/src/math/smoke/nanf_test.cpp @@ -43,8 +43,8 @@ TEST_F(LlvmLibcNanfTest, RandomString) { run_test("123 ", 0x7fc00000); } -#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) TEST_F(LlvmLibcNanfTest, InvalidInput) { - EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV)); + EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }); } #endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp index c7217928e943b0..dea969fd3d2adc 100644 --- a/libc/test/src/math/smoke/nanl_test.cpp +++ b/libc/test/src/math/smoke/nanl_test.cpp @@ -71,8 +71,8 @@ TEST_F(LlvmLibcNanlTest, RandomString) { run_test("123 ", expected); } -#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) TEST_F(LlvmLibcNanlTest, InvalidInput) { - EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV)); + EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }); } #endif // LIBC_HAS_ADDRESS_SANITIZER From f78359cf43cb990f66412059383cdd95ab6e6ec4 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Wed, 22 Jan 2025 17:45:23 +0000 Subject: [PATCH 016/208] [mlir][spirv] Add definition for OpEmitVertex and OpEndPrimitive (#123759) This is hopefully the first 
patch in the series of patches adding some missing SPIR-V ops to MLIR over the next weeks/months, starting with something simple: `OpEmitVertex` and `OpEndPrimitive`. Since the ops have no inputs and outputs, and the only condition is "This instruction must only be used when only one stream is present.", which I don't think can be validated at the instruction level in isolation, I set `hasVerifier` to 0. I hope I didn't miss anything, but I'm more than happy to address any comments. --- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 10 ++- .../include/mlir/Dialect/SPIRV/IR/SPIRVOps.td | 1 + .../Dialect/SPIRV/IR/SPIRVPrimitiveOps.td | 81 +++++++++++++++++++ mlir/test/Dialect/SPIRV/IR/availability.mlir | 22 +++++ mlir/test/Dialect/SPIRV/IR/primitive-ops.mlir | 21 +++++ mlir/test/Target/SPIRV/primitive-ops.mlir | 17 ++++ 6 files changed, 148 insertions(+), 4 deletions(-) create mode 100755 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVPrimitiveOps.td create mode 100644 mlir/test/Dialect/SPIRV/IR/primitive-ops.mlir create mode 100644 mlir/test/Target/SPIRV/primitive-ops.mlir diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 469a9a0ef01dd2..c84677d26a8b69 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4438,6 +4438,8 @@ def SPIRV_OC_OpBitFieldSExtract : I32EnumAttrCase<"OpBitFieldSExtrac def SPIRV_OC_OpBitFieldUExtract : I32EnumAttrCase<"OpBitFieldUExtract", 203>; def SPIRV_OC_OpBitReverse : I32EnumAttrCase<"OpBitReverse", 204>; def SPIRV_OC_OpBitCount : I32EnumAttrCase<"OpBitCount", 205>; +def SPIRV_OC_OpEmitVertex : I32EnumAttrCase<"OpEmitVertex", 218>; +def SPIRV_OC_OpEndPrimitive : I32EnumAttrCase<"OpEndPrimitive", 219>; def SPIRV_OC_OpControlBarrier : I32EnumAttrCase<"OpControlBarrier", 224>; def SPIRV_OC_OpMemoryBarrier : I32EnumAttrCase<"OpMemoryBarrier", 225>; def SPIRV_OC_OpAtomicExchange : I32EnumAttrCase<"OpAtomicExchange", 229>; @@ 
-4576,7 +4578,8 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpBitwiseOr, SPIRV_OC_OpBitwiseXor, SPIRV_OC_OpBitwiseAnd, SPIRV_OC_OpNot, SPIRV_OC_OpBitFieldInsert, SPIRV_OC_OpBitFieldSExtract, SPIRV_OC_OpBitFieldUExtract, SPIRV_OC_OpBitReverse, SPIRV_OC_OpBitCount, - SPIRV_OC_OpControlBarrier, SPIRV_OC_OpMemoryBarrier, SPIRV_OC_OpAtomicExchange, + SPIRV_OC_OpEmitVertex, SPIRV_OC_OpEndPrimitive, SPIRV_OC_OpControlBarrier, + SPIRV_OC_OpMemoryBarrier, SPIRV_OC_OpAtomicExchange, SPIRV_OC_OpAtomicCompareExchange, SPIRV_OC_OpAtomicCompareExchangeWeak, SPIRV_OC_OpAtomicIIncrement, SPIRV_OC_OpAtomicIDecrement, SPIRV_OC_OpAtomicIAdd, SPIRV_OC_OpAtomicISub, SPIRV_OC_OpAtomicSMin, @@ -4609,9 +4612,8 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpCooperativeMatrixLengthKHR, SPIRV_OC_OpSubgroupBlockReadINTEL, SPIRV_OC_OpSubgroupBlockWriteINTEL, SPIRV_OC_OpAssumeTrueKHR, SPIRV_OC_OpAtomicFAddEXT, SPIRV_OC_OpConvertFToBF16INTEL, - SPIRV_OC_OpConvertBF16ToFINTEL, - SPIRV_OC_OpControlBarrierArriveINTEL, SPIRV_OC_OpControlBarrierWaitINTEL, - SPIRV_OC_OpGroupIMulKHR, + SPIRV_OC_OpConvertBF16ToFINTEL, SPIRV_OC_OpControlBarrierArriveINTEL, + SPIRV_OC_OpControlBarrierWaitINTEL, SPIRV_OC_OpGroupIMulKHR, SPIRV_OC_OpGroupFMulKHR ]>; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td index 9912f195ba11e6..ff1ca89f93b5ac 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td @@ -40,6 +40,7 @@ include "mlir/Dialect/SPIRV/IR/SPIRVMatrixOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td" +include "mlir/Dialect/SPIRV/IR/SPIRVPrimitiveOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVCLOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td" include "mlir/Interfaces/SideEffectInterfaces.td" diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVPrimitiveOps.td 
b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVPrimitiveOps.td new file mode 100755 index 00000000000000..c390ae52bb7e5b --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVPrimitiveOps.td @@ -0,0 +1,81 @@ +//===-- SPIRVPrimitiveOps.td - MLIR SPIR-V Primitive Ops ------*- tablegen -*------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------------------===// +// +// This file contains primitive ops for the SPIR-V dialect. It corresponds +// to "3.52.19. Primitive Instructions" of the SPIR-V specification. +// +//===-----------------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_PRIMITIVE_OPS +#define MLIR_DIALECT_SPIRV_PRIMITIVE_OPS + +include "mlir/Dialect/SPIRV/IR/SPIRVBase.td" + +// ----- + +def SPIRV_EmitVertexOp : SPIRV_Op<"EmitVertex", []> { + let summary = [{ + Emits the current values of all output variables to the current output + primitive. After execution, the values of all output variables are + undefined. + }]; + + let description = [{ + This instruction must only be used when only one stream is present. + + #### Example: + + ```mlir + spirv.EmitVertex + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_Geometry]> + ]; + + let arguments = (ins); + let results = (outs); + let hasVerifier = 0; + let assemblyFormat = "attr-dict"; +} + +// ----- + +def SPIRV_EndPrimitiveOp : SPIRV_Op<"EndPrimitive", []> { + let summary = [{ + Finish the current primitive and start a new one. No vertex is emitted. + }]; + + let description = [{ + This instruction must only be used when only one stream is present. 
+ + #### Example: + + ```mlir + spirv.EndPrimitive + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_Geometry]> + ]; + + let arguments = (ins); + let results = (outs); + let hasVerifier = 0; + let assemblyFormat = "attr-dict"; +} + +#endif // MLIR_DIALECT_SPIRV_PRIMITIVE_OPS diff --git a/mlir/test/Dialect/SPIRV/IR/availability.mlir b/mlir/test/Dialect/SPIRV/IR/availability.mlir index c583a48eba2704..31a90ad0329d80 100644 --- a/mlir/test/Dialect/SPIRV/IR/availability.mlir +++ b/mlir/test/Dialect/SPIRV/IR/availability.mlir @@ -233,3 +233,25 @@ func.func @udot_acc_sat_vector_4xi16_i64(%a: vector<4xi16>, %acc: i64) -> i64 { %r = spirv.UDotAccSat %a, %a, %acc: vector<4xi16> -> i64 return %r: i64 } + +//===----------------------------------------------------------------------===// +// Primitive ops +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: emit_vertex +func.func @emit_vertex() -> () { + // CHECK: min version: v1.0 + // CHECK: max version: v1.6 + // CHECK: capabilities: [ [Geometry] ] + spirv.EmitVertex + return +} + +// CHECK-LABEL: end_primitive +func.func @end_primitive() -> () { + // CHECK: min version: v1.0 + // CHECK: max version: v1.6 + // CHECK: capabilities: [ [Geometry] ] + spirv.EndPrimitive + return +} diff --git a/mlir/test/Dialect/SPIRV/IR/primitive-ops.mlir b/mlir/test/Dialect/SPIRV/IR/primitive-ops.mlir new file mode 100644 index 00000000000000..451c3345b4e0d5 --- /dev/null +++ b/mlir/test/Dialect/SPIRV/IR/primitive-ops.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-opt %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// spirv.EmitVertex +//===----------------------------------------------------------------------===// + +func.func @emit_vertex() { + // CHECK: spirv.EmitVertex + spirv.EmitVertex + spirv.Return +} + +//===----------------------------------------------------------------------===// +// 
spirv.EndPrimitive +//===----------------------------------------------------------------------===// + +func.func @end_primitive() { + // CHECK: spirv.EndPrimitive + spirv.EndPrimitive + spirv.Return +} diff --git a/mlir/test/Target/SPIRV/primitive-ops.mlir b/mlir/test/Target/SPIRV/primitive-ops.mlir new file mode 100644 index 00000000000000..63a0b1e74784aa --- /dev/null +++ b/mlir/test/Target/SPIRV/primitive-ops.mlir @@ -0,0 +1,17 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.GlobalVariable @out : !spirv.ptr, f32, !spirv.array<1 x f32>)>, Output> + spirv.func @primitive_ops() "None" { + // CHECK: spirv.EmitVertex + spirv.EmitVertex + // CHECK: spirv.EndPrimitive + spirv.EndPrimitive + spirv.Return + } + spirv.EntryPoint "Geometry" @primitive_ops, @out + spirv.ExecutionMode @primitive_ops "InputPoints" + spirv.ExecutionMode @primitive_ops "Invocations", 1 + spirv.ExecutionMode @primitive_ops "OutputLineStrip" + spirv.ExecutionMode @primitive_ops "OutputVertices", 2 +} From 511dc261ab94da7db6e67b05cdcef9dcff44798a Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 22 Jan 2025 17:47:26 +0000 Subject: [PATCH 017/208] [lldb][DWARFASTParserClang][NFCI] Factor out CV-qualifier/is_static parsing from ParseChildParameters (#123951) This patch continues simplifying `ParseChildParameters` by moving out the logic that parses the first parameter of a function DIE into a helper function. Since with GCC (and lately Clang) function declarations have `DW_AT_object_pointer`s, we should be able to check for the attribute's existence to determine if a function is static (and also deduce CV-qualifiers from it). This will be useful for cases where the object parameter is explicit (which is possible since C++23). This should be NFC. I added a FIXME to places where we assume an implicit object parameter (which will be addressed in a follow-up patch). 
We used to guard parsing of the CV-qualifiers of the "this" parameter with a `encoding_mask & Type::eEncodingIsPointerUID`, which is incorrect, because `eEncodingIsPointerUID` cannot be used as a bitmask directly (see https://github.com/llvm/llvm-project/issues/120856). This patch corrects this, but it should still be NFC because any parameter in C++ called "this" *is* an implicit object parameter. --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 166 +++++++++++------- .../SymbolFile/DWARF/DWARFASTParserClang.h | 8 +- 2 files changed, 109 insertions(+), 65 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 81a1375c037182..f54b7fc9cdad24 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -159,6 +159,76 @@ static bool TagIsRecordType(dw_tag_t tag) { } } +/// Get the object parameter DIE if one exists, otherwise returns +/// a default DWARFDIE. If \c containing_decl_ctx is not a valid +/// C++ declaration context for class methods, assume no object +/// parameter exists for the given \c subprogram. +static DWARFDIE +GetCXXObjectParameter(const DWARFDIE &subprogram, + const clang::DeclContext &containing_decl_ctx) { + assert(subprogram.Tag() == DW_TAG_subprogram || + subprogram.Tag() == DW_TAG_inlined_subroutine || + subprogram.Tag() == DW_TAG_subroutine_type); + + if (!DeclKindIsCXXClass(containing_decl_ctx.getDeclKind())) + return {}; + + // FIXME: if subprogram has a explicit DW_AT_object_pointer, use it. + + // If no DW_AT_object_pointer was specified, assume the implicit object + // parameter is the first parameter to the function, is called "this" and is + // artificial (which is what most compilers would generate). 
+ auto children = subprogram.children(); + auto it = llvm::find_if(children, [](const DWARFDIE &child) { + return child.Tag() == DW_TAG_formal_parameter; + }); + + if (it == children.end()) + return {}; + + DWARFDIE object_pointer = *it; + + if (!object_pointer.GetAttributeValueAsUnsigned(DW_AT_artificial, 0)) + return {}; + + // Often times compilers omit the "this" name for the + // specification DIEs, so we can't rely upon the name being in + // the formal parameter DIE... + if (const char *name = object_pointer.GetName(); + name && ::strcmp(name, "this") != 0) + return {}; + + return object_pointer; +} + +/// In order to determine the CV-qualifiers for a C++ class +/// method in DWARF, we have to look at the CV-qualifiers of +/// the object parameter's type. +static unsigned GetCXXMethodCVQuals(const DWARFDIE &subprogram, + const DWARFDIE &object_parameter) { + if (!subprogram || !object_parameter) + return 0; + + Type *this_type = subprogram.ResolveTypeUID( + object_parameter.GetAttributeValueAsReferenceDIE(DW_AT_type)); + if (!this_type) + return 0; + + uint32_t encoding_mask = this_type->GetEncodingMask(); + + // FIXME: explicit object parameters need not to be pointers + if (!(encoding_mask & (1u << Type::eEncodingIsPointerUID))) + return 0; + + unsigned cv_quals = 0; + if (encoding_mask & (1u << Type::eEncodingIsConstUID)) + cv_quals |= clang::Qualifiers::Const; + if (encoding_mask & (1u << Type::eEncodingIsVolatileUID)) + cv_quals |= clang::Qualifiers::Volatile; + + return cv_quals; +} + TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc, const DWARFDIE &die, Log *log) { @@ -1188,11 +1258,8 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, const dw_tag_t tag = die.Tag(); bool is_variadic = false; - bool is_static = false; bool has_template_params = false; - unsigned type_quals = 0; - DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), DW_TAG_value_to_name(tag), type_name_cstr); @@ -1215,23 +1282,15 @@ 
DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, DWARFDIE decl_ctx_die; clang::DeclContext *containing_decl_ctx = GetClangDeclContextContainingDIE(die, &decl_ctx_die); - const clang::Decl::Kind containing_decl_kind = - containing_decl_ctx->getDeclKind(); - - bool is_cxx_method = DeclKindIsCXXClass(containing_decl_kind); - // Start off static. This will be set to false in - // ParseChildParameters(...) if we find a "this" parameters as the - // first parameter - if (is_cxx_method) { - is_static = true; - } + assert(containing_decl_ctx); if (die.HasChildren()) { - ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic, + ParseChildParameters(containing_decl_ctx, die, is_variadic, has_template_params, function_param_types, - function_param_decls, type_quals); + function_param_decls); } + bool is_cxx_method = DeclKindIsCXXClass(containing_decl_ctx->getDeclKind()); bool ignore_containing_context = false; // Check for templatized class member functions. If we had any // DW_TAG_template_type_parameter or DW_TAG_template_value_parameter @@ -1251,12 +1310,16 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, clang::CallingConv calling_convention = ConvertDWARFCallingConventionToClang(attrs); + const DWARFDIE object_parameter = + GetCXXObjectParameter(die, *containing_decl_ctx); + // clang_type will get the function prototype clang type after this // call CompilerType clang_type = m_ast.CreateFunctionType(return_clang_type, function_param_types.data(), function_param_types.size(), is_variadic, - type_quals, calling_convention, attrs.ref_qual); + GetCXXMethodCVQuals(die, object_parameter), + calling_convention, attrs.ref_qual); if (attrs.name) { bool type_handled = false; @@ -1267,6 +1330,8 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, type_handled = ParseObjCMethod(*objc_method, die, clang_type, attrs, is_variadic); } else if (is_cxx_method) { + // In DWARF, a C++ method is static if it has no object parameter child. 
+ const bool is_static = !object_parameter.IsValid(); auto [handled, type_sp] = ParseCXXMethod(die, clang_type, attrs, decl_ctx_die, is_static, ignore_containing_context); @@ -2315,10 +2380,8 @@ size_t DWARFASTParserClang::ParseChildEnumerators( ConstString DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { - bool is_static = false; bool is_variadic = false; bool has_template_params = false; - unsigned type_quals = 0; std::vector param_types; std::vector param_decls; StreamString sstr; @@ -2328,9 +2391,13 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { clang::DeclContext *containing_decl_ctx = GetClangDeclContextContainingDIE(die, nullptr); - ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic, - has_template_params, param_types, param_decls, - type_quals); + assert(containing_decl_ctx); + + const unsigned cv_quals = GetCXXMethodCVQuals( + die, GetCXXObjectParameter(die, *containing_decl_ctx)); + + ParseChildParameters(containing_decl_ctx, die, is_variadic, + has_template_params, param_types, param_decls); sstr << "("; for (size_t i = 0; i < param_types.size(); i++) { if (i > 0) @@ -2340,7 +2407,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { if (is_variadic) sstr << ", ..."; sstr << ")"; - if (type_quals & clang::Qualifiers::Const) + if (cv_quals & clang::Qualifiers::Const) sstr << " const"; return ConstString(sstr.GetString()); @@ -3070,57 +3137,37 @@ bool DWARFASTParserClang::ParseChildMembers( return true; } -size_t DWARFASTParserClang::ParseChildParameters( +void DWARFASTParserClang::ParseChildParameters( clang::DeclContext *containing_decl_ctx, const DWARFDIE &parent_die, - bool &is_static, bool &is_variadic, bool &has_template_params, + bool &is_variadic, bool &has_template_params, std::vector &function_param_types, - std::vector &function_param_decls, - unsigned &type_quals) { + std::vector &function_param_decls) { if (!parent_die) - return 0; + 
return; - size_t arg_idx = 0; for (DWARFDIE die : parent_die.children()) { const dw_tag_t tag = die.Tag(); switch (tag) { case DW_TAG_formal_parameter: { + if (die.GetAttributeValueAsUnsigned(DW_AT_artificial, 0)) + continue; + const char *name = die.GetName(); DWARFDIE param_type_die = die.GetAttributeValueAsReferenceDIE(DW_AT_type); - if (die.GetAttributeValueAsUnsigned(DW_AT_artificial, 0)) { - // In order to determine if a C++ member function is "const" we - // have to look at the const-ness of "this"... - if (arg_idx == 0 && - DeclKindIsCXXClass(containing_decl_ctx->getDeclKind()) && - // Often times compilers omit the "this" name for the - // specification DIEs, so we can't rely upon the name being in - // the formal parameter DIE... - (name == nullptr || ::strcmp(name, "this") == 0)) { - if (Type *this_type = die.ResolveTypeUID(param_type_die)) { - uint32_t encoding_mask = this_type->GetEncodingMask(); - if (encoding_mask & Type::eEncodingIsPointerUID) { - is_static = false; - - if (encoding_mask & (1u << Type::eEncodingIsConstUID)) - type_quals |= clang::Qualifiers::Const; - if (encoding_mask & (1u << Type::eEncodingIsVolatileUID)) - type_quals |= clang::Qualifiers::Volatile; - } - } - } - } else if (Type *type = die.ResolveTypeUID(param_type_die)) { - function_param_types.push_back(type->GetForwardCompilerType()); + Type *type = die.ResolveTypeUID(param_type_die); + if (!type) + break; - clang::ParmVarDecl *param_var_decl = m_ast.CreateParameterDeclaration( - containing_decl_ctx, GetOwningClangModule(die), name, - type->GetForwardCompilerType(), clang::StorageClass::SC_None); - assert(param_var_decl); - function_param_decls.push_back(param_var_decl); + function_param_types.push_back(type->GetForwardCompilerType()); - m_ast.SetMetadataAsUserID(param_var_decl, die.GetID()); - } + clang::ParmVarDecl *param_var_decl = m_ast.CreateParameterDeclaration( + containing_decl_ctx, GetOwningClangModule(die), name, + type->GetForwardCompilerType(), 
clang::StorageClass::SC_None); + assert(param_var_decl); + function_param_decls.push_back(param_var_decl); - arg_idx++; + m_ast.SetMetadataAsUserID(param_var_decl, die.GetID()); } break; case DW_TAG_unspecified_parameters: @@ -3142,7 +3189,6 @@ size_t DWARFASTParserClang::ParseChildParameters( break; } } - return arg_idx; } clang::Decl *DWARFASTParserClang::GetClangDeclForDIE(const DWARFDIE &die) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 5b1c204bbe8155..a5c3746ada4c36 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -186,14 +186,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); - size_t + void ParseChildParameters(clang::DeclContext *containing_decl_ctx, const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool &is_static, bool &is_variadic, - bool &has_template_params, + bool &is_variadic, bool &has_template_params, std::vector &function_args, - std::vector &function_param_decls, - unsigned &type_quals); + std::vector &function_param_decls); size_t ParseChildEnumerators( const lldb_private::CompilerType &compiler_type, bool is_signed, From b1943f40e74dcfe4ebd6213e1a8a01403bd5ffa9 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Wed, 22 Jan 2025 09:50:49 -0800 Subject: [PATCH 018/208] [BranchFolding] Remove getBranchDebugLoc() (#114613) --- llvm/lib/CodeGen/BranchFolding.cpp | 42 ++++++++++++------------------ 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 65476fa05a2030..29a3076b57e207 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1268,15 +1268,6 @@ static bool 
IsBetterFallthrough(MachineBasicBlock *MBB1, return MBB2I->isCall() && !MBB1I->isCall(); } -/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch -/// instructions on the block. -static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) { - MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); - if (I != MBB.end() && I->isBranch()) - return I->getDebugLoc(); - return DebugLoc(); -} - static void copyDebugInfoToPredecessor(const TargetInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock &PredMBB) { @@ -1403,11 +1394,11 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { // destination, remove the branch, replacing it with an unconditional one or // a fall-through. if (PriorTBB && PriorTBB == PriorFBB) { - DebugLoc dl = getBranchDebugLoc(PrevBB); + DebugLoc Dl = PrevBB.findBranchDebugLoc(); TII->removeBranch(PrevBB); PriorCond.clear(); if (PriorTBB != MBB) - TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl); + TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, Dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1461,9 +1452,9 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { // If the prior block branches somewhere else on the condition and here if // the condition is false, remove the uncond second branch. 
if (PriorFBB == MBB) { - DebugLoc dl = getBranchDebugLoc(PrevBB); + DebugLoc Dl = PrevBB.findBranchDebugLoc(); TII->removeBranch(PrevBB); - TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl); + TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, Dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1475,9 +1466,9 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { if (PriorTBB == MBB) { SmallVector NewPriorCond(PriorCond); if (!TII->reverseBranchCondition(NewPriorCond)) { - DebugLoc dl = getBranchDebugLoc(PrevBB); + DebugLoc Dl = PrevBB.findBranchDebugLoc(); TII->removeBranch(PrevBB); - TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl); + TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, Dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1513,9 +1504,9 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "\nMoving MBB: " << *MBB << "To make fallthrough to: " << *PriorTBB << "\n"); - DebugLoc dl = getBranchDebugLoc(PrevBB); + DebugLoc Dl = PrevBB.findBranchDebugLoc(); TII->removeBranch(PrevBB); - TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl); + TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, Dl); // Move this block to the end of the function. 
MBB->moveAfter(&MF.back()); @@ -1576,9 +1567,9 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) { SmallVector NewCond(CurCond); if (!TII->reverseBranchCondition(NewCond)) { - DebugLoc dl = getBranchDebugLoc(*MBB); + DebugLoc Dl = MBB->findBranchDebugLoc(); TII->removeBranch(*MBB); - TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, dl); + TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, Dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1590,7 +1581,7 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { if (CurTBB && CurCond.empty() && !CurFBB && IsBranchOnlyBlock(MBB) && CurTBB != MBB && !MBB->hasAddressTaken() && !MBB->isEHPad()) { - DebugLoc dl = getBranchDebugLoc(*MBB); + DebugLoc Dl = MBB->findBranchDebugLoc(); // This block may contain just an unconditional branch. Because there can // be 'non-branch terminators' in the block, try removing the branch and // then seeing if the block is empty. @@ -1624,9 +1615,9 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { assert(!PriorFBB && "Machine CFG out of date!"); PriorFBB = MBB; } - DebugLoc pdl = getBranchDebugLoc(PrevBB); + DebugLoc PrevDl = PrevBB.findBranchDebugLoc(); TII->removeBranch(PrevBB); - TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl); + TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, PrevDl); } // Iterate through all the predecessors, revectoring each in-turn. 
@@ -1659,10 +1650,11 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool NewCurUnAnalyzable = TII->analyzeBranch( *PMBB, NewCurTBB, NewCurFBB, NewCurCond, true); if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) { - DebugLoc pdl = getBranchDebugLoc(*PMBB); + DebugLoc PrevDl = PMBB->findBranchDebugLoc(); TII->removeBranch(*PMBB); NewCurCond.clear(); - TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl); + TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, + PrevDl); MadeChange = true; ++NumBranchOpts; } @@ -1681,7 +1673,7 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { } // Add the branch back if the block is more than just an uncond branch. - TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, dl); + TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, Dl); } } From a56ba1fab07b2c9b1d0287bdf56cdad4d54a5f33 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:51:18 -0600 Subject: [PATCH 019/208] [ValueTracking] Handle recursive select/PHI in ComputeKnownBits (#114689) Finish porting #114008 to `KnownBits` (Follow up to #113707). 
--- llvm/lib/Analysis/ValueTracking.cpp | 73 ++++----- .../Analysis/ScalarEvolution/cycled_phis.ll | 4 +- .../Analysis/ScalarEvolution/unknown_phis.ll | 4 +- .../InstCombine/known-phi-recurse.ll | 138 ++++++++++++++++++ .../switch-branch-fold-indirectbr-102351.ll | 30 ++-- 5 files changed, 194 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 6e2f0ebde9bb6c..38f88850be0f18 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -592,6 +592,36 @@ static bool cmpExcludesZero(CmpInst::Predicate Pred, const Value *RHS) { return true; } +static void breakSelfRecursivePHI(const Use *U, const PHINode *PHI, + Value *&ValOut, Instruction *&CtxIOut) { + ValOut = U->get(); + if (ValOut == PHI) + return; + CtxIOut = PHI->getIncomingBlock(*U)->getTerminator(); + Value *V; + // If the Use is a select of this phi, compute analysis on other arm to break + // recursion. + // TODO: Min/Max + if (match(ValOut, m_Select(m_Value(), m_Specific(PHI), m_Value(V))) || + match(ValOut, m_Select(m_Value(), m_Value(V), m_Specific(PHI)))) + ValOut = V; + + // Same for select, if this phi is 2-operand phi, compute analysis on other + // incoming value to break recursion. + // TODO: We could handle any number of incoming edges as long as we only have + // two unique values. + else if (auto *IncPhi = dyn_cast(ValOut); + IncPhi && IncPhi->getNumIncomingValues() == 2) { + for (int Idx = 0; Idx < 2; ++Idx) { + if (IncPhi->getIncomingValue(Idx) == PHI) { + ValOut = IncPhi->getIncomingValue(1 - Idx); + CtxIOut = IncPhi->getIncomingBlock(1 - Idx)->getTerminator(); + break; + } + } + } +} + static bool isKnownNonZeroFromAssume(const Value *V, const SimplifyQuery &Q) { // Use of assumptions is context-sensitive. If we don't have a context, we // cannot use them! 
@@ -1641,25 +1671,19 @@ static void computeKnownBitsFromOperator(const Operator *I, Known.Zero.setAllBits(); Known.One.setAllBits(); - for (unsigned u = 0, e = P->getNumIncomingValues(); u < e; ++u) { - Value *IncValue = P->getIncomingValue(u); + for (const Use &U : P->operands()) { + Value *IncValue; + Instruction *CxtI; + breakSelfRecursivePHI(&U, P, IncValue, CxtI); // Skip direct self references. - if (IncValue == P) continue; - - // If the Use is a select of this phi, use the knownbit of the other - // operand to break the recursion. - if (auto *SI = dyn_cast(IncValue)) { - if (SI->getTrueValue() == P || SI->getFalseValue() == P) - IncValue = SI->getTrueValue() == P ? SI->getFalseValue() - : SI->getTrueValue(); - } + if (IncValue == P) + continue; // Change the context instruction to the "edge" that flows into the // phi. This is important because that is where the value is actually // "evaluated" even though it is used later somewhere else. (see also // D69571). - SimplifyQuery RecQ = Q.getWithoutCondContext(); - RecQ.CxtI = P->getIncomingBlock(u)->getTerminator(); + SimplifyQuery RecQ = Q.getWithoutCondContext().getWithInstruction(CxtI); Known2 = KnownBits(BitWidth); @@ -6053,30 +6077,13 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, bool First = true; for (const Use &U : P->operands()) { - Value *IncValue = U.get(); + Value *IncValue; + Instruction *CxtI; + breakSelfRecursivePHI(&U, P, IncValue, CxtI); // Skip direct self references. if (IncValue == P) continue; - Instruction *CxtI = P->getIncomingBlock(U)->getTerminator(); - - // If the Use is a select of this phi, use the fp class of the other - // operand to break the recursion. 
Same around 2-operand phi nodes - Value *V; - if (match(IncValue, m_Select(m_Value(), m_Specific(P), m_Value(V))) || - match(IncValue, m_Select(m_Value(), m_Value(V), m_Specific(P)))) { - IncValue = V; - } else if (auto *IncPhi = dyn_cast(IncValue); - IncPhi && IncPhi->getNumIncomingValues() == 2) { - for (int Idx = 0; Idx < 2; ++Idx) { - if (IncPhi->getIncomingValue(Idx) == P) { - IncValue = IncPhi->getIncomingValue(1 - Idx); - CxtI = IncPhi->getIncomingBlock(1 - Idx)->getTerminator(); - break; - } - } - } - KnownFPClass KnownSrc; // Recurse, but cap the recursion to two levels, because we don't want // to waste time spinning around in loops. We need at least depth 2 to diff --git a/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll b/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll index ec244595e8fe39..478bcf94daf697 100644 --- a/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll +++ b/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll @@ -8,9 +8,9 @@ define void @test_01() { ; CHECK-LABEL: 'test_01' ; CHECK-NEXT: Classifying expressions for: @test_01 ; CHECK-NEXT: %phi_1 = phi i32 [ 10, %entry ], [ %phi_2, %loop ] -; CHECK-NEXT: --> %phi_1 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %phi_1 U: [0,31) S: [0,31) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %phi_2 = phi i32 [ 20, %entry ], [ %phi_1, %loop ] -; CHECK-NEXT: --> %phi_2 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %phi_2 U: [0,31) S: [0,31) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %cond = call i1 @cond() ; CHECK-NEXT: --> %cond U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: Determining loop execution counts for: @test_01 diff --git a/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll b/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll index bdfe38f67de0b9..c6d430f96b7de1 100644 --- a/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll +++ 
b/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll @@ -39,9 +39,9 @@ define void @merge_values_with_ranges_looped(ptr %a_len_ptr, ptr %b_len_ptr) { ; CHECK-NEXT: %len_b = load i32, ptr %b_len_ptr, align 4, !range !0 ; CHECK-NEXT: --> %len_b U: [0,2147483647) S: [0,2147483647) ; CHECK-NEXT: %p1 = phi i32 [ %len_a, %entry ], [ %p2, %loop ] -; CHECK-NEXT: --> %p1 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %p1 U: [0,-2147483648) S: [0,-2147483648) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %p2 = phi i32 [ %len_b, %entry ], [ %p1, %loop ] -; CHECK-NEXT: --> %p2 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %p2 U: [0,-2147483648) S: [0,-2147483648) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] ; CHECK-NEXT: --> {0,+,1}<%loop> U: [0,100) S: [0,100) Exits: 99 LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %iv.next = add i32 %iv, 1 diff --git a/llvm/test/Transforms/InstCombine/known-phi-recurse.ll b/llvm/test/Transforms/InstCombine/known-phi-recurse.ll index 9008ee9ca8e061..c05cca93b035c4 100644 --- a/llvm/test/Transforms/InstCombine/known-phi-recurse.ll +++ b/llvm/test/Transforms/InstCombine/known-phi-recurse.ll @@ -256,4 +256,142 @@ exit: ret i8 %bool } +define i8 @knownbits_umax_select_test() { +; CHECK-LABEL: @knownbits_umax_select_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[CONTAIN:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[COND0:%.*]] = call i1 @cond() +; CHECK-NEXT: [[CONTAIN]] = call i8 @llvm.umax.i8(i8 [[INDVAR]], i8 1) +; CHECK-NEXT: [[COND1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND1]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[BOOL:%.*]] = and i8 [[CONTAIN]], 1 +; CHECK-NEXT: ret i8 [[BOOL]] +; +entry: + br label %loop + +loop: + %indvar = phi i8 [ 0, 
%entry ], [ %contain, %loop ] + %cond0 = call i1 @cond() + %contain = call i8 @llvm.umax.i8(i8 1, i8 %indvar) + %cond1 = call i1 @cond() + br i1 %cond1, label %exit, label %loop + +exit: + %bool = and i8 %contain, 1 + ret i8 %bool +} + +define i8 @knownbits_phi_phi_test() { +; CHECK-LABEL: @knownbits_phi_phi_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[CONTAIN:%.*]], [[LOOP_BB1:%.*]] ] +; CHECK-NEXT: [[COND0:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND0]], label [[LOOP_BB0:%.*]], label [[LOOP_BB1]] +; CHECK: loop.bb0: +; CHECK-NEXT: call void @side.effect() +; CHECK-NEXT: br label [[LOOP_BB1]] +; CHECK: loop.bb1: +; CHECK-NEXT: [[CONTAIN]] = phi i8 [ 1, [[LOOP_BB0]] ], [ [[INDVAR]], [[LOOP]] ] +; CHECK-NEXT: [[COND1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND1]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i8 [[CONTAIN]] +; +entry: + br label %loop + +loop: + %indvar = phi i8 [ 0, %entry ], [ %contain, %loop.bb1 ] + %cond0 = call i1 @cond() + br i1 %cond0, label %loop.bb0, label %loop.bb1 +loop.bb0: + call void @side.effect() + br label %loop.bb1 +loop.bb1: + %contain = phi i8 [ 1, %loop.bb0 ], [ %indvar, %loop ] + %cond1 = call i1 @cond() + br i1 %cond1, label %exit, label %loop + +exit: + %bool = and i8 %contain, 1 + ret i8 %bool +} + + +define i1 @known_non_zero_phi_phi_test() { +; CHECK-LABEL: @known_non_zero_phi_phi_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i8 [ 2, [[ENTRY:%.*]] ], [ [[CONTAIN:%.*]], [[LOOP_BB1:%.*]] ] +; CHECK-NEXT: [[COND0:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND0]], label [[LOOP_BB0:%.*]], label [[LOOP_BB1]] +; CHECK: loop.bb0: +; CHECK-NEXT: call void @side.effect() +; CHECK-NEXT: br label [[LOOP_BB1]] +; CHECK: loop.bb1: +; CHECK-NEXT: [[CONTAIN]] = phi i8 [ 1, [[LOOP_BB0]] ], [ [[INDVAR]], [[LOOP]] ] +; 
CHECK-NEXT: [[COND1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND1]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[BOOL:%.*]] = icmp eq i8 [[CONTAIN]], 0 +; CHECK-NEXT: ret i1 [[BOOL]] +; +entry: + br label %loop + +loop: + %indvar = phi i8 [ 2, %entry ], [ %contain, %loop.bb1 ] + %cond0 = call i1 @cond() + br i1 %cond0, label %loop.bb0, label %loop.bb1 +loop.bb0: + call void @side.effect() + br label %loop.bb1 +loop.bb1: + %contain = phi i8 [ 1, %loop.bb0 ], [ %indvar, %loop ] + %cond1 = call i1 @cond() + br i1 %cond1, label %exit, label %loop + +exit: + %bool = icmp eq i8 %contain, 0 + ret i1 %bool +} + +define i1 @known_non_zero_phi_select_test() { +; CHECK-LABEL: @known_non_zero_phi_select_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i8 [ 2, [[ENTRY:%.*]] ], [ [[CONTAIN:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[COND0:%.*]] = call i1 @cond() +; CHECK-NEXT: [[CONTAIN]] = select i1 [[COND0]], i8 1, i8 [[INDVAR]] +; CHECK-NEXT: [[COND1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[COND1]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[BOOL:%.*]] = icmp eq i8 [[CONTAIN]], 0 +; CHECK-NEXT: ret i1 [[BOOL]] +; +entry: + br label %loop + +loop: + %indvar = phi i8 [ 2, %entry ], [ %contain, %loop ] + %cond0 = call i1 @cond() + %contain = select i1 %cond0, i8 1, i8 %indvar + %cond1 = call i1 @cond() + br i1 %cond1, label %exit, label %loop + +exit: + %bool = icmp eq i8 %contain, 0 + ret i1 %bool +} + declare i1 @cond() +declare void @side.effect() + diff --git a/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll index d3713be8358db4..0308d513240fef 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll @@ -6,35 +6,29 @@ define i32 @foo.1(i32 %arg, ptr 
%arg1) { ; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]]) { ; CHECK-NEXT: [[BB:.*]]: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x ptr], align 16 -; CHECK-NEXT: store ptr blockaddress(@foo.1, %[[BB8:.*]]), ptr [[ALLOCA]], align 16 +; CHECK-NEXT: store ptr blockaddress(@foo.1, %[[BB2:.*]]), ptr [[ALLOCA]], align 16 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ALLOCA]], i64 0, i64 1 ; CHECK-NEXT: store ptr blockaddress(@foo.1, %[[BB16:.*]]), ptr [[GETELEMENTPTR]], align 8 -; CHECK-NEXT: br label %[[PREFBB2:.*]] -; CHECK: [[PREFBB2]]: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI14:%.*]], %[[BB13:.*]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI15:%.*]], %[[BB13]] ] -; CHECK-NEXT: switch i32 [[PHI]], label %[[BB13]] [ -; CHECK-NEXT: i32 0, label %[[PREFBB18:.*]] -; CHECK-NEXT: i32 1, label %[[BB8]] +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ 2, %[[BB18:.*]] ] +; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[ARG]], %[[BB18]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[BB2_UNREACHABLEDEFAULT:.*]] [ +; CHECK-NEXT: i32 0, label %[[BB18]] ; CHECK-NEXT: i32 2, label %[[PREFBB11:.*]] ; CHECK-NEXT: ] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[PHI10:%.*]] = phi i32 [ [[ARG]], %[[PREFBB18]] ], [ [[PHI3]], %[[PREFBB2]] ] -; CHECK-NEXT: br label %[[BB13]] ; CHECK: [[PREFBB11]]: ; CHECK-NEXT: [[CALL:%.*]] = call i32 @wombat(i32 noundef [[PHI3]]) ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[PHI3]], 1 -; CHECK-NEXT: br label %[[PREFBB18]] -; CHECK: [[BB13]]: -; CHECK-NEXT: [[PHI14]] = phi i32 [ [[PHI]], %[[PREFBB2]] ], [ 2, %[[BB8]] ] -; CHECK-NEXT: [[PHI15]] = phi i32 [ [[PHI3]], %[[PREFBB2]] ], [ [[PHI10]], %[[BB8]] ] -; CHECK-NEXT: br label %[[PREFBB2]] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB2_UNREACHABLEDEFAULT]]: +; CHECK-NEXT: unreachable ; CHECK: [[BB16]]: ; CHECK-NEXT: [[CALL17:%.*]] = call i32 @wombat(i32 noundef [[ARG]]) ; CHECK-NEXT: ret 
i32 0 -; CHECK: [[PREFBB18]]: +; CHECK: [[BB18]]: ; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: indirectbr ptr [[LOAD]], [label %[[BB8]], label %bb16] +; CHECK-NEXT: indirectbr ptr [[LOAD]], [label %[[BB2]], label %bb16] ; bb: %alloca = alloca [2 x ptr], align 16 From afcbcae668f1d8061974247f2828190173aef742 Mon Sep 17 00:00:00 2001 From: Anchu Rajendran S Date: Wed, 22 Jan 2025 09:53:54 -0800 Subject: [PATCH 020/208] [mlir][OpenMP] inscan reduction modifier and scan op mlir support (#114737) Scan directive allows to specify scan reductions within an worksharing loop, worksharing loop simd or simd directive which should have an `InScan` modifier associated with it. This change adds the mlir support for the same. Related PR: [Parsing and Semantic Support for scan](https://github.com/llvm/llvm-project/pull/102792) --- .../mlir/Dialect/OpenMP/OpenMPClauses.td | 84 +++++++++- .../mlir/Dialect/OpenMP/OpenMPEnums.td | 21 +++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 34 +++- .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp | 1 + mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 147 ++++++++++++------ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 17 +- mlir/test/Dialect/OpenMP/invalid.mlir | 104 +++++++++++++ mlir/test/Dialect/OpenMP/ops.mlir | 23 +++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 31 ++++ 9 files changed, 403 insertions(+), 59 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 8af054be322a55..a8d97a36df79ee 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -334,6 +334,43 @@ class OpenMP_DoacrossClauseSkip< def OpenMP_DoacrossClause : OpenMP_DoacrossClauseSkip<>; +//===----------------------------------------------------------------------===// +// V5.2: [5.4.7] `exclusive` clause +//===----------------------------------------------------------------------===// + +class 
OpenMP_ExclusiveClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let arguments = (ins + Variadic:$exclusive_vars + ); + + let optAssemblyFormat = [{ + `exclusive` `(` $exclusive_vars `:` type($exclusive_vars) `)` + }]; + + let extraClassDeclaration = [{ + bool hasExclusiveVars() { + return !getExclusiveVars().empty(); + } + }]; + + let description = [{ + The exclusive clause is used on a separating directive that separates a + structured block into two structured block sequences. If it + is specified, the input phase excludes the preceding structured block + sequence and instead includes the following structured block sequence, + while the scan phase includes the preceding structured block sequence. + + The `exclusive_vars` is a variadic list of operands that specifies the + scan-reduction accumulator symbols. + }]; +} + +def OpenMP_ExclusiveClause : OpenMP_ExclusiveClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [10.5.1] `filter` clause //===----------------------------------------------------------------------===// @@ -444,6 +481,43 @@ class OpenMP_HasDeviceAddrClauseSkip< def OpenMP_HasDeviceAddrClause : OpenMP_HasDeviceAddrClauseSkip<>; +//===----------------------------------------------------------------------===// +// V5.2: [5.4.7] `inclusive` clause +//===----------------------------------------------------------------------===// + +class OpenMP_InclusiveClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let arguments = (ins + Variadic:$inclusive_vars + ); + + let optAssemblyFormat = [{ + `inclusive` `(` $inclusive_vars `:` type($inclusive_vars) `)` + }]; + + let extraClassDeclaration = [{ + bool hasInclusiveVars() { + return !getInclusiveVars().empty(); + } + }]; 
+ + let description = [{ + The inclusive clause is used on a separating directive that separates a + structured block into two structured block sequences. If it is specified, + the input phase includes the preceding structured block sequence and the + scan phase includes the following structured block sequence. + + The `inclusive_vars` is a variadic list of operands that specifies the + scan-reduction accumulator symbols. + }]; +} + +def OpenMP_InclusiveClause : OpenMP_InclusiveClauseSkip<>; + + //===----------------------------------------------------------------------===// // V5.2: [15.1.2] `hint` clause //===----------------------------------------------------------------------===// @@ -1100,6 +1174,7 @@ class OpenMP_ReductionClauseSkip< ]; let arguments = (ins + OptionalAttr:$reduction_mod, Variadic:$reduction_vars, OptionalAttr:$reduction_byref, OptionalAttr:$reduction_syms @@ -1113,10 +1188,11 @@ class OpenMP_ReductionClauseSkip< // Description varies depending on the operation. let description = [{ - Reductions can be performed by specifying reduction accumulator variables in - `reduction_vars`, symbols referring to reduction declarations in the - `reduction_syms` attribute, and whether the reduction variable should be - passed into the reduction region by value or by reference in + Reductions can be performed by specifying the reduction modifer + (`default`, `inscan` or `task`) in `reduction_mod`, reduction accumulator + variables in `reduction_vars`, symbols referring to reduction declarations + in the `reduction_syms` attribute, and whether the reduction variable + should be passed into the reduction region by value or by reference in `reduction_byref`. Each reduction is identified by the accumulator it uses and accumulators must not be repeated in the same reduction. 
A private variable corresponding to the accumulator is used in place of the diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index 2091c0c76dff72..690e3df1f685e3 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -179,6 +179,27 @@ def OrderModifier def OrderModifierAttr : EnumAttr; +//===----------------------------------------------------------------------===// +// reduction_modifier enum. +//===----------------------------------------------------------------------===// + +def ReductionModifierDefault : I32EnumAttrCase<"defaultmod", 0>; +def ReductionModifierInscan : I32EnumAttrCase<"inscan", 1>; +def ReductionModifierTask : I32EnumAttrCase<"task", 2>; + +def ReductionModifier : OpenMP_I32EnumAttr< + "ReductionModifier", + "reduction modifier", [ + ReductionModifierDefault, + ReductionModifierInscan, + ReductionModifierTask + ]>; + +def ReductionModifierAttr : OpenMP_EnumAttr { + let assemblyFormat = "`(` $value `)`"; +} + //===----------------------------------------------------------------------===// // sched_mod enum. 
//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index c5b88904367086..580c9c6ef6fde8 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -178,7 +178,7 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -223,7 +223,7 @@ def TeamsOp : OpenMP_Op<"teams", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -282,7 +282,7 @@ def SectionsOp : OpenMP_Op<"sections", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -469,7 +469,7 @@ def LoopOp : OpenMP_Op<"loop", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -521,7 +521,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, 
type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -575,7 +575,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [ let assemblyFormat = clausesAssemblyFormat # [{ custom($region, $private_vars, type($private_vars), - $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -782,7 +782,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [ custom( $region, $in_reduction_vars, type($in_reduction_vars), $in_reduction_byref, $in_reduction_syms, $private_vars, - type($private_vars), $private_syms, $reduction_vars, + type($private_vars), $private_syms, $reduction_mod, $reduction_vars, type($reduction_vars), $reduction_byref, $reduction_syms) attr-dict }]; @@ -1706,6 +1706,26 @@ def CancellationPointOp : OpenMP_Op<"cancellation_point", clauses = [ let hasVerifier = 1; } +def ScanOp : OpenMP_Op<"scan", [ + AttrSizedOperandSegments, MemoryEffects<[MemWrite]> + ], clauses = [ + OpenMP_InclusiveClause, OpenMP_ExclusiveClause]> { + let summary = "scan directive"; + let description = [{ + The scan directive allows to specify scan reductions. It should be + enclosed within a parent directive along with which a reduction clause + with `inscan` modifier must be specified. The scan directive allows to + split code blocks into input phase and scan phase in the region + enclosed by the parent. 
+ }] # clausesDescription; + + let builders = [ + OpBuilder<(ins CArg<"const ScanOperands &">:$clauses)> + ]; + + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // 2.19.5.7 declare reduction Directive //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index aa241b91d758ca..233739e1d6d917 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -451,6 +451,7 @@ struct ParallelOpLowering : public OpRewritePattern { /* private_vars = */ ValueRange(), /* private_syms = */ nullptr, /* proc_bind_kind = */ omp::ClauseProcBindKindAttr{}, + /* reduction_mod = */ nullptr, /* reduction_vars = */ llvm::SmallVector{}, /* reduction_byref = */ DenseBoolArrayAttr{}, /* reduction_syms = */ ArrayAttr{}); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 5a619254a5ee14..88f56dc5144229 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -494,16 +494,19 @@ struct PrivateParseArgs { DenseI64ArrayAttr *mapIndices = nullptr) : vars(vars), types(types), syms(syms), mapIndices(mapIndices) {} }; + struct ReductionParseArgs { SmallVectorImpl &vars; SmallVectorImpl &types; DenseBoolArrayAttr &byref; ArrayAttr &syms; + ReductionModifierAttr *modifier; ReductionParseArgs(SmallVectorImpl &vars, SmallVectorImpl &types, DenseBoolArrayAttr &byref, - ArrayAttr &syms) - : vars(vars), types(types), byref(byref), syms(syms) {} + ArrayAttr &syms, ReductionModifierAttr *mod = nullptr) + : vars(vars), types(types), byref(byref), syms(syms), modifier(mod) {} }; + struct AllRegionParseArgs { std::optional hostEvalArgs; std::optional inReductionArgs; @@ -522,7 +525,8 @@ static ParseResult parseClauseWithRegionArgs( SmallVectorImpl 
&types, SmallVectorImpl ®ionPrivateArgs, ArrayAttr *symbols = nullptr, DenseI64ArrayAttr *mapIndices = nullptr, - DenseBoolArrayAttr *byref = nullptr) { + DenseBoolArrayAttr *byref = nullptr, + ReductionModifierAttr *modifier = nullptr) { SmallVector symbolVec; SmallVector mapIndicesVec; SmallVector isByRefVec; @@ -531,6 +535,20 @@ static ParseResult parseClauseWithRegionArgs( if (parser.parseLParen()) return failure(); + if (modifier && succeeded(parser.parseOptionalKeyword("mod"))) { + StringRef enumStr; + if (parser.parseColon() || parser.parseKeyword(&enumStr) || + parser.parseComma()) + return failure(); + std::optional enumValue = + symbolizeReductionModifier(enumStr); + if (!enumValue.has_value()) + return failure(); + *modifier = ReductionModifierAttr::get(parser.getContext(), *enumValue); + if (!*modifier) + return failure(); + } + if (parser.parseCommaSeparatedList([&]() { if (byref) isByRefVec.push_back( @@ -635,11 +653,10 @@ static ParseResult parseBlockArgClause( if (succeeded(parser.parseOptionalKeyword(keyword))) { if (!reductionArgs) return failure(); - if (failed(parseClauseWithRegionArgs( parser, reductionArgs->vars, reductionArgs->types, entryBlockArgs, - &reductionArgs->syms, /*mapIndices=*/nullptr, - &reductionArgs->byref))) + &reductionArgs->syms, /*mapIndices=*/nullptr, &reductionArgs->byref, + reductionArgs->modifier))) return failure(); } return success(); @@ -735,6 +752,7 @@ static ParseResult parseInReductionPrivateReductionRegion( DenseBoolArrayAttr &inReductionByref, ArrayAttr &inReductionSyms, llvm::SmallVectorImpl &privateVars, llvm::SmallVectorImpl &privateTypes, ArrayAttr &privateSyms, + ReductionModifierAttr &reductionMod, SmallVectorImpl &reductionVars, SmallVectorImpl &reductionTypes, DenseBoolArrayAttr &reductionByref, ArrayAttr &reductionSyms) { @@ -743,7 +761,7 @@ static ParseResult parseInReductionPrivateReductionRegion( inReductionByref, inReductionSyms); args.privateArgs.emplace(privateVars, privateTypes, privateSyms); 
args.reductionArgs.emplace(reductionVars, reductionTypes, reductionByref, - reductionSyms); + reductionSyms, &reductionMod); return parseBlockArgRegion(parser, region, args); } @@ -760,13 +778,14 @@ static ParseResult parsePrivateReductionRegion( OpAsmParser &parser, Region ®ion, llvm::SmallVectorImpl &privateVars, llvm::SmallVectorImpl &privateTypes, ArrayAttr &privateSyms, + ReductionModifierAttr &reductionMod, SmallVectorImpl &reductionVars, SmallVectorImpl &reductionTypes, DenseBoolArrayAttr &reductionByref, ArrayAttr &reductionSyms) { AllRegionParseArgs args; args.privateArgs.emplace(privateVars, privateTypes, privateSyms); args.reductionArgs.emplace(reductionVars, reductionTypes, reductionByref, - reductionSyms); + reductionSyms, &reductionMod); return parseBlockArgRegion(parser, region, args); } @@ -817,9 +836,10 @@ struct ReductionPrintArgs { TypeRange types; DenseBoolArrayAttr byref; ArrayAttr syms; + ReductionModifierAttr modifier; ReductionPrintArgs(ValueRange vars, TypeRange types, DenseBoolArrayAttr byref, - ArrayAttr syms) - : vars(vars), types(types), byref(byref), syms(syms) {} + ArrayAttr syms, ReductionModifierAttr mod = nullptr) + : vars(vars), types(types), byref(byref), syms(syms), modifier(mod) {} }; struct AllRegionPrintArgs { std::optional hostEvalArgs; @@ -833,18 +853,20 @@ struct AllRegionPrintArgs { }; } // namespace -static void printClauseWithRegionArgs(OpAsmPrinter &p, MLIRContext *ctx, - StringRef clauseName, - ValueRange argsSubrange, - ValueRange operands, TypeRange types, - ArrayAttr symbols = nullptr, - DenseI64ArrayAttr mapIndices = nullptr, - DenseBoolArrayAttr byref = nullptr) { +static void printClauseWithRegionArgs( + OpAsmPrinter &p, MLIRContext *ctx, StringRef clauseName, + ValueRange argsSubrange, ValueRange operands, TypeRange types, + ArrayAttr symbols = nullptr, DenseI64ArrayAttr mapIndices = nullptr, + DenseBoolArrayAttr byref = nullptr, + ReductionModifierAttr modifier = nullptr) { if (argsSubrange.empty()) return; p 
<< clauseName << "("; + if (modifier) + p << "mod: " << stringifyReductionModifier(modifier.getValue()) << ", "; + if (!symbols) { llvm::SmallVector values(operands.size(), nullptr); symbols = ArrayAttr::get(ctx, values); @@ -905,7 +927,7 @@ printBlockArgClause(OpAsmPrinter &p, MLIRContext *ctx, StringRef clauseName, printClauseWithRegionArgs(p, ctx, clauseName, argsSubrange, reductionArgs->vars, reductionArgs->types, reductionArgs->syms, /*mapIndices=*/nullptr, - reductionArgs->byref); + reductionArgs->byref, reductionArgs->modifier); } static void printBlockArgRegion(OpAsmPrinter &p, Operation *op, Region ®ion, @@ -968,7 +990,8 @@ static void printInReductionPrivateReductionRegion( OpAsmPrinter &p, Operation *op, Region ®ion, ValueRange inReductionVars, TypeRange inReductionTypes, DenseBoolArrayAttr inReductionByref, ArrayAttr inReductionSyms, ValueRange privateVars, TypeRange privateTypes, - ArrayAttr privateSyms, ValueRange reductionVars, TypeRange reductionTypes, + ArrayAttr privateSyms, ReductionModifierAttr reductionMod, + ValueRange reductionVars, TypeRange reductionTypes, DenseBoolArrayAttr reductionByref, ArrayAttr reductionSyms) { AllRegionPrintArgs args; args.inReductionArgs.emplace(inReductionVars, inReductionTypes, @@ -976,7 +999,7 @@ static void printInReductionPrivateReductionRegion( args.privateArgs.emplace(privateVars, privateTypes, privateSyms, /*mapIndices=*/nullptr); args.reductionArgs.emplace(reductionVars, reductionTypes, reductionByref, - reductionSyms); + reductionSyms, reductionMod); printBlockArgRegion(p, op, region, args); } @@ -991,14 +1014,15 @@ static void printPrivateRegion(OpAsmPrinter &p, Operation *op, Region ®ion, static void printPrivateReductionRegion( OpAsmPrinter &p, Operation *op, Region ®ion, ValueRange privateVars, - TypeRange privateTypes, ArrayAttr privateSyms, ValueRange reductionVars, + TypeRange privateTypes, ArrayAttr privateSyms, + ReductionModifierAttr reductionMod, ValueRange reductionVars, TypeRange 
reductionTypes, DenseBoolArrayAttr reductionByref, ArrayAttr reductionSyms) { AllRegionPrintArgs args; args.privateArgs.emplace(privateVars, privateTypes, privateSyms, /*mapIndices=*/nullptr); args.reductionArgs.emplace(reductionVars, reductionTypes, reductionByref, - reductionSyms); + reductionSyms, reductionMod); printBlockArgRegion(p, op, region, args); } @@ -1942,7 +1966,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, /*allocator_vars=*/ValueRange(), /*if_expr=*/nullptr, /*num_threads=*/nullptr, /*private_vars=*/ValueRange(), /*private_syms=*/nullptr, /*proc_bind_kind=*/nullptr, - /*reduction_vars=*/ValueRange(), + /*reduction_mod =*/nullptr, /*reduction_vars=*/ValueRange(), /*reduction_byref=*/nullptr, /*reduction_syms=*/nullptr); state.addAttributes(attributes); } @@ -1953,7 +1977,8 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, clauses.ifExpr, clauses.numThreads, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), - clauses.procBindKind, clauses.reductionVars, + clauses.procBindKind, clauses.reductionMod, + clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms)); } @@ -2052,12 +2077,13 @@ void TeamsOp::build(OpBuilder &builder, OperationState &state, const TeamsOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, privateSyms. 
- TeamsOp::build( - builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifExpr, clauses.numTeamsLower, clauses.numTeamsUpper, - /*private_vars=*/{}, /*private_syms=*/nullptr, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionByref), - makeArrayAttr(ctx, clauses.reductionSyms), clauses.threadLimit); + TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, + clauses.ifExpr, clauses.numTeamsLower, clauses.numTeamsUpper, + /*private_vars=*/{}, /*private_syms=*/nullptr, + clauses.reductionMod, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), + clauses.threadLimit); } LogicalResult TeamsOp::verify() { @@ -2114,7 +2140,8 @@ void SectionsOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. SectionsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, clauses.nowait, /*private_vars=*/{}, - /*private_syms=*/nullptr, clauses.reductionVars, + /*private_syms=*/nullptr, clauses.reductionMod, + clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms)); } @@ -2221,7 +2248,7 @@ void LoopOp::build(OpBuilder &builder, OperationState &state, LoopOp::build(builder, state, clauses.bindKind, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.order, - clauses.orderMod, clauses.reductionVars, + clauses.orderMod, clauses.reductionMod, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms)); } @@ -2249,7 +2276,8 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state, /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(), /*nowait=*/false, /*order=*/nullptr, /*order_mod=*/nullptr, /*ordered=*/nullptr, /*private_vars=*/{}, /*private_syms=*/nullptr, - /*reduction_vars=*/ValueRange(), /*reduction_byref=*/nullptr, + 
/*reduction_mod=*/nullptr, /*reduction_vars=*/ValueRange(), + /*reduction_byref=*/nullptr, /*reduction_syms=*/nullptr, /*schedule_kind=*/nullptr, /*schedule_chunk=*/nullptr, /*schedule_mod=*/nullptr, /*schedule_simd=*/false); @@ -2261,15 +2289,16 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); // TODO: Store clauses in op: allocateVars, allocatorVars, privateVars, // privateSyms. - WsloopOp::build( - builder, state, - /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.linearVars, - clauses.linearStepVars, clauses.nowait, clauses.order, clauses.orderMod, - clauses.ordered, clauses.privateVars, - makeArrayAttr(ctx, clauses.privateSyms), clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionByref), - makeArrayAttr(ctx, clauses.reductionSyms), clauses.scheduleKind, - clauses.scheduleChunk, clauses.scheduleMod, clauses.scheduleSimd); + WsloopOp::build(builder, state, + /*allocate_vars=*/{}, /*allocator_vars=*/{}, + clauses.linearVars, clauses.linearStepVars, clauses.nowait, + clauses.order, clauses.orderMod, clauses.ordered, + clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), + clauses.reductionMod, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), + clauses.scheduleKind, clauses.scheduleChunk, + clauses.scheduleMod, clauses.scheduleSimd); } LogicalResult WsloopOp::verify() { @@ -2316,7 +2345,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state, /*linear_vars=*/{}, /*linear_step_vars=*/{}, clauses.nontemporalVars, clauses.order, clauses.orderMod, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), - clauses.reductionVars, + clauses.reductionMod, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen, clauses.simdlen); @@ -2548,7 +2577,7 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, 
makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{}, - /*private_syms=*/nullptr, clauses.reductionVars, + /*private_syms=*/nullptr, clauses.reductionMod, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), makeArrayAttr(ctx, clauses.reductionSyms), clauses.untied); } @@ -3125,6 +3154,36 @@ void MaskedOp::build(OpBuilder &builder, OperationState &state, MaskedOp::build(builder, state, clauses.filteredThreadId); } +//===----------------------------------------------------------------------===// +// Spec 5.2: Scan construct (5.6) +//===----------------------------------------------------------------------===// + +void ScanOp::build(OpBuilder &builder, OperationState &state, + const ScanOperands &clauses) { + ScanOp::build(builder, state, clauses.inclusiveVars, clauses.exclusiveVars); +} + +LogicalResult ScanOp::verify() { + if (hasExclusiveVars() == hasInclusiveVars()) + return emitError( + "Exactly one of EXCLUSIVE or INCLUSIVE clause is expected"); + if (WsloopOp parentWsLoopOp = (*this)->getParentOfType()) { + if (parentWsLoopOp.getReductionModAttr() && + parentWsLoopOp.getReductionModAttr().getValue() == + ReductionModifier::inscan) + return success(); + } + if (SimdOp parentSimdOp = (*this)->getParentOfType()) { + if (parentSimdOp.getReductionModAttr() && + parentSimdOp.getReductionModAttr().getValue() == + ReductionModifier::inscan) + return success(); + } + return emitError("SCAN directive needs to be enclosed within a parent " + "worksharing loop construct or SIMD construct with INSCAN " + "reduction modifier"); +} + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 29089cb28a5a8e..3fcdefa8a2f673 100644 --- 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -237,9 +237,13 @@ static LogicalResult checkImplementationStatus(Operation &op) { } }; auto checkReduction = [&todo](auto op, LogicalResult &result) { - if (!op.getReductionVars().empty() || op.getReductionByref() || - op.getReductionSyms()) - result = todo("reduction"); + if (isa(op) || isa(op)) + if (!op.getReductionVars().empty() || op.getReductionByref() || + op.getReductionSyms()) + result = todo("reduction"); + if (op.getReductionMod() && + op.getReductionMod().value() != omp::ReductionModifier::defaultmod) + result = todo("reduction with modifier"); }; auto checkTaskReduction = [&todo](auto op, LogicalResult &result) { if (!op.getTaskReductionVars().empty() || op.getTaskReductionByref() || @@ -257,6 +261,7 @@ static LogicalResult checkImplementationStatus(Operation &op) { .Case([&](omp::SectionsOp op) { checkAllocate(op, result); checkPrivate(op, result); + checkReduction(op, result); }) .Case([&](omp::SingleOp op) { checkAllocate(op, result); @@ -288,8 +293,12 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkAllocate(op, result); checkLinear(op, result); checkOrder(op, result); + checkReduction(op, result); + }) + .Case([&](omp::ParallelOp op) { + checkAllocate(op, result); + checkReduction(op, result); }) - .Case([&](omp::ParallelOp op) { checkAllocate(op, result); }) .Case([&](omp::SimdOp op) { checkLinear(op, result); checkNontemporal(op, result); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index c611614265592c..06fcf90e34480b 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1825,6 +1825,110 @@ func.func @omp_cancellationpoint2() { // ----- +omp.declare_reduction @add_f32 : f32 +init { + ^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { + ^bb1(%arg0: f32, 
%arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @scan_test_2(%lb: i32, %ub: i32, %step: i32) { + %test1f32 = "test.f32"() : () -> (!llvm.ptr) + omp.wsloop reduction(mod:inscan, @add_f32 %test1f32 -> %arg1 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // expected-error @below {{Exactly one of EXCLUSIVE or INCLUSIVE clause is expected}} + omp.scan + omp.yield + } + } + return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { + ^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { + ^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @scan_test_2(%lb: i32, %ub: i32, %step: i32) { + %test1f32 = "test.f32"() : () -> (!llvm.ptr) + omp.wsloop reduction(mod:inscan, @add_f32 %test1f32 -> %arg1 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // expected-error @below {{Exactly one of EXCLUSIVE or INCLUSIVE clause is expected}} + omp.scan inclusive(%test1f32 : !llvm.ptr) exclusive(%test1f32: !llvm.ptr) + omp.yield + } + } + return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { + ^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { + ^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @scan_test_2(%lb: i32, %ub: i32, %step: i32) { + %test1f32 = "test.f32"() : () -> (!llvm.ptr) + omp.wsloop reduction(@add_f32 %test1f32 -> %arg1 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // expected-error @below {{SCAN directive needs to be enclosed within a parent worksharing loop construct or SIMD construct with INSCAN reduction modifier}} + omp.scan inclusive(%test1f32 : !llvm.ptr) + omp.yield + } + } + return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { + ^bb0(%arg: f32): + %0 = 
arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { + ^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @scan_test_2(%lb: i32, %ub: i32, %step: i32) { + %test1f32 = "test.f32"() : () -> (!llvm.ptr) + omp.taskloop reduction(mod:inscan, @add_f32 %test1f32 -> %arg1 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // expected-error @below {{SCAN directive needs to be enclosed within a parent worksharing loop construct or SIMD construct with INSCAN reduction modifier}} + omp.scan inclusive(%test1f32 : !llvm.ptr) + omp.yield + } + } + return +} + +// ----- + func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testmemref = "test.memref"() : () -> (memref) // expected-error @below {{expected equal sizes for allocate and allocator variables}} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index b1901c333ade8d..c1259fabe82fba 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -900,6 +900,29 @@ func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) { return } +// CHECK-LABEL: func @wsloop_inscan_reduction +func.func @wsloop_inscan_reduction(%lb : index, %ub : index, %step : index) { + %c1 = arith.constant 1 : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + // CHECK: reduction(mod: inscan, @add_f32 %{{.+}} -> %[[PRV:.+]] : !llvm.ptr) + omp.wsloop reduction(mod:inscan, @add_f32 %0 -> %prv : !llvm.ptr) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.scan inclusive(%{{.*}} : !llvm.ptr) + omp.scan inclusive(%prv : !llvm.ptr) + omp.yield + } + } + // CHECK: reduction(mod: inscan, @add_f32 %{{.+}} -> %[[PRV:.+]] : !llvm.ptr) + omp.wsloop reduction(mod:inscan, @add_f32 %0 -> %prv : !llvm.ptr) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.scan exclusive(%{{.*}} : !llvm.ptr) + omp.scan exclusive(%prv : 
!llvm.ptr) + omp.yield + } + } + return +} + // CHECK-LABEL: func @wsloop_reduction_byref func.func @wsloop_reduction_byref(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index bb2a74841e9afb..457bb87fae27ad 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -186,6 +186,37 @@ llvm.func @simd_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { // ----- +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func @scan_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{not yet implemented: Unhandled clause reduction with modifier in omp.wsloop operation}} + // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} + omp.wsloop reduction(mod:inscan, @add_f32 %x -> %prv : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.scan inclusive(%prv : !llvm.ptr) + omp.yield + } + } + llvm.return +} + +// ----- + llvm.func @single_allocate(%x : !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.single operation}} // expected-error@below {{LLVM Translation failed for operation: omp.single}} From 9f83c4ed1c8d09bf4c246ec6dd758bde1756f60c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 22 Jan 2025 10:04:39 -0800 Subject: [PATCH 021/208] [flang][cuda] Allocate descriptor 
in managed memory on rebox block argument (#123971) Another case where the descriptor must be allocated with the CUF runtime and not a simple alloca instruction. --- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 38 +++++++++++++------------ flang/test/Fir/CUDA/cuda-code-gen.mlir | 11 +++++++ 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 43c0e2686a8c3b..6ff2c20d744537 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2040,19 +2040,20 @@ struct XReboxOpConversion : public EmboxCommonConversion { getBaseAddrFromBox(loc, inputBoxTyPair, loweredBox, rewriter); if (!rebox.getSlice().empty() || !rebox.getSubcomponent().empty()) - return sliceBox(rebox, boxTy, dest, baseAddr, inputExtents, inputStrides, - operands, rewriter); - return reshapeBox(rebox, boxTy, dest, baseAddr, inputExtents, inputStrides, - operands, rewriter); + return sliceBox(rebox, adaptor, boxTy, dest, baseAddr, inputExtents, + inputStrides, operands, rewriter); + return reshapeBox(rebox, adaptor, boxTy, dest, baseAddr, inputExtents, + inputStrides, operands, rewriter); } private: /// Write resulting shape and base address in descriptor, and replace rebox /// op. 
llvm::LogicalResult - finalizeRebox(fir::cg::XReboxOp rebox, mlir::Type destBoxTy, mlir::Value dest, - mlir::Value base, mlir::ValueRange lbounds, - mlir::ValueRange extents, mlir::ValueRange strides, + finalizeRebox(fir::cg::XReboxOp rebox, OpAdaptor adaptor, + mlir::Type destBoxTy, mlir::Value dest, mlir::Value base, + mlir::ValueRange lbounds, mlir::ValueRange extents, + mlir::ValueRange strides, mlir::ConversionPatternRewriter &rewriter) const { mlir::Location loc = rebox.getLoc(); mlir::Value zero = @@ -2075,15 +2076,15 @@ struct XReboxOpConversion : public EmboxCommonConversion { dest = insertBaseAddress(rewriter, loc, dest, base); mlir::Value result = placeInMemoryIfNotGlobalInit( rewriter, rebox.getLoc(), destBoxTy, dest, - isDeviceAllocation(rebox.getBox(), rebox.getBox())); + isDeviceAllocation(rebox.getBox(), adaptor.getBox())); rewriter.replaceOp(rebox, result); return mlir::success(); } // Apply slice given the base address, extents and strides of the input box. llvm::LogicalResult - sliceBox(fir::cg::XReboxOp rebox, mlir::Type destBoxTy, mlir::Value dest, - mlir::Value base, mlir::ValueRange inputExtents, + sliceBox(fir::cg::XReboxOp rebox, OpAdaptor adaptor, mlir::Type destBoxTy, + mlir::Value dest, mlir::Value base, mlir::ValueRange inputExtents, mlir::ValueRange inputStrides, mlir::ValueRange operands, mlir::ConversionPatternRewriter &rewriter) const { mlir::Location loc = rebox.getLoc(); @@ -2109,7 +2110,7 @@ struct XReboxOpConversion : public EmboxCommonConversion { if (rebox.getSlice().empty()) // The array section is of the form array[%component][substring], keep // the input array extents and strides. 
- return finalizeRebox(rebox, destBoxTy, dest, base, + return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, /*lbounds*/ std::nullopt, inputExtents, inputStrides, rewriter); @@ -2158,15 +2159,16 @@ struct XReboxOpConversion : public EmboxCommonConversion { slicedStrides.emplace_back(stride); } } - return finalizeRebox(rebox, destBoxTy, dest, base, /*lbounds*/ std::nullopt, - slicedExtents, slicedStrides, rewriter); + return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, + /*lbounds*/ std::nullopt, slicedExtents, slicedStrides, + rewriter); } /// Apply a new shape to the data described by a box given the base address, /// extents and strides of the box. llvm::LogicalResult - reshapeBox(fir::cg::XReboxOp rebox, mlir::Type destBoxTy, mlir::Value dest, - mlir::Value base, mlir::ValueRange inputExtents, + reshapeBox(fir::cg::XReboxOp rebox, OpAdaptor adaptor, mlir::Type destBoxTy, + mlir::Value dest, mlir::Value base, mlir::ValueRange inputExtents, mlir::ValueRange inputStrides, mlir::ValueRange operands, mlir::ConversionPatternRewriter &rewriter) const { mlir::ValueRange reboxShifts{ @@ -2175,7 +2177,7 @@ struct XReboxOpConversion : public EmboxCommonConversion { rebox.getShift().size()}; if (rebox.getShape().empty()) { // Only setting new lower bounds. - return finalizeRebox(rebox, destBoxTy, dest, base, reboxShifts, + return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, reboxShifts, inputExtents, inputStrides, rewriter); } @@ -2199,8 +2201,8 @@ struct XReboxOpConversion : public EmboxCommonConversion { // nextStride = extent * stride; stride = rewriter.create(loc, idxTy, extent, stride); } - return finalizeRebox(rebox, destBoxTy, dest, base, reboxShifts, newExtents, - newStrides, rewriter); + return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, reboxShifts, + newExtents, newStrides, rewriter); } /// Return scalar element type of the input box. 
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir index 7ac89836a3ff16..063454799502af 100644 --- a/flang/test/Fir/CUDA/cuda-code-gen.mlir +++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir @@ -187,3 +187,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vec // CHECK-LABEL: llvm.func @_QPouter // CHECK: _FortranACUFAllocDescriptor + +// ----- + +func.func @_QMm1Psub1(%arg0: !fir.box> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "da"}, %arg1: !fir.box> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "db"}, %arg2: !fir.ref {fir.bindc_name = "n"}) { + %0 = fircg.ext_rebox %arg0 : (!fir.box>) -> !fir.box> + %1 = fircg.ext_rebox %arg1 : (!fir.box>) -> !fir.box> + return +} + +// CHECK-LABEL: llvm.func @_QMm1Psub1 +// CHECK-COUNT-2: _FortranACUFAllocDescriptor From 7bf188fa991338e981e8dff120a4ed341ad7f4bd Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 22 Jan 2025 10:11:53 -0800 Subject: [PATCH 022/208] [NFC] Minor fix to tryEmitAbstract type in EmitCXXNewAllocSize (#123433) In EmitCXXNewAllocSize, when handling a constant array size, we were calling tryEmitAbstract with the type of the object being allocated rather than the expected type of the array size. This worked out because the allocated type was always a pointer and tryEmitAbstract only ends up using the size of the type to extend or truncate the constant, and in this case the destination type should be size_t, which is usually the same width as the pointer. This change fixes the type, but it makes no functional difference with the current constant emitter implementation. 
--- clang/lib/CodeGen/CGExprCXX.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 648b9b9ed98063..f71c18a8041b10 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -732,8 +732,8 @@ static llvm::Value *EmitCXXNewAllocSize(CodeGenFunction &CGF, // Emit the array size expression. // We multiply the size of all dimensions for NumElements. // e.g for 'int[2][3]', ElemType is 'int' and NumElements is 6. - numElements = - ConstantEmitter(CGF).tryEmitAbstract(*e->getArraySize(), e->getType()); + numElements = ConstantEmitter(CGF).tryEmitAbstract( + *e->getArraySize(), (*e->getArraySize())->getType()); if (!numElements) numElements = CGF.EmitScalarExpr(*e->getArraySize()); assert(isa(numElements->getType())); From b40739a6e90cfb000b49de819251c1581fd5ee50 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Jan 2025 10:12:27 -0800 Subject: [PATCH 023/208] Revert "[LLVM][Clang][AArch64] Implement AArch64 build attributes (#118771)" This reverts commit d7fb4a275c98f4035d1083b5eb3edd2ffb2da00e. 
Buildbots failing: https://lab.llvm.org/buildbot/#/builders/169/builds/7671 https://lab.llvm.org/buildbot/#/builders/65/builds/11046 --- llvm/include/llvm/BinaryFormat/ELF.h | 2 - llvm/include/llvm/MC/MCELFStreamer.h | 25 +- .../llvm/Support/AArch64BuildAttributes.h | 75 ----- llvm/lib/MC/MCELFStreamer.cpp | 65 +---- llvm/lib/Support/AArch64BuildAttributes.cpp | 117 -------- llvm/lib/Support/CMakeLists.txt | 1 - llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 105 ++----- .../AArch64/AsmParser/AArch64AsmParser.cpp | 271 +----------------- .../MCTargetDesc/AArch64ELFStreamer.cpp | 148 +--------- .../MCTargetDesc/AArch64TargetStreamer.cpp | 104 ------- .../MCTargetDesc/AArch64TargetStreamer.h | 32 --- .../AArch64/aarch64-build-attributes-all.ll | 22 -- .../AArch64/aarch64-build-attributes-bti.ll | 20 -- .../AArch64/aarch64-build-attributes-gcs.ll | 20 -- .../AArch64/aarch64-build-attributes-pac.ll | 20 -- .../aarch64-build-attributes-pauthabi.ll | 19 -- .../aarch64-build-attributes-asm-all.s | 27 -- .../aarch64-build-attributes-asm-bti.s | 19 -- .../aarch64-build-attributes-asm-err-attrs.s | 70 ----- ...aarch64-build-attributes-asm-err-headers.s | 61 ---- .../aarch64-build-attributes-asm-gcs.s | 19 -- .../aarch64-build-attributes-asm-none.s | 27 -- ...ch64-build-attributes-asm-numerical-tags.s | 41 --- ...arch64-build-attributes-asm-out-of-order.s | 50 ---- .../aarch64-build-attributes-asm-pac.s | 19 -- ...d-attributes-asm-private-subsections-err.s | 28 -- ...build-attributes-asm-private-subsections.s | 51 ---- .../gn/secondary/llvm/lib/Support/BUILD.gn | 1 - 28 files changed, 22 insertions(+), 1437 deletions(-) delete mode 100644 llvm/include/llvm/Support/AArch64BuildAttributes.h delete mode 100644 llvm/lib/Support/AArch64BuildAttributes.cpp delete mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll delete mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll delete mode 100644 
llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll delete mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll delete mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s delete mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 48ae0db80f43ee..1bc69f791bd84c 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1158,8 +1158,6 @@ enum : unsigned { SHT_ARM_ATTRIBUTES = 0x70000003U, SHT_ARM_DEBUGOVERLAY = 0x70000004U, SHT_ARM_OVERLAYSECTION = 0x70000005U, - // Support for AArch64 build attributes - SHT_AARCH64_ATTRIBUTES = 0x70000003U, // Special aarch64-specific section for MTE support, as described in: // https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#section-types SHT_AARCH64_AUTH_RELR = 0x70000004U, diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index 5a1cdd9e96cad4..94d14088d0f5d2 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -96,7 
+96,7 @@ class MCELFStreamer : public MCObjectStreamer { // This structure holds all attributes, accounting for their string / // numeric value, so we can later emit them in declaration order, keeping // all in the same vector. - enum Types { + enum { HiddenAttribute = 0, NumericAttribute, TextAttribute, @@ -105,17 +105,6 @@ class MCELFStreamer : public MCObjectStreamer { unsigned Tag; unsigned IntValue; std::string StringValue; - AttributeItem(Types Ty, unsigned Tg, unsigned IV, std::string SV) - : Type(Ty), Tag(Tg), IntValue(IV), StringValue(SV) {} - }; - - /// ELF object attributes subsection support - struct AttributeSubSection { - bool IsActive; - StringRef VendorName; - unsigned IsOptional; - unsigned ParameterType; - SmallVector Content; }; // Attributes that are added and managed entirely by target. @@ -130,23 +119,13 @@ class MCELFStreamer : public MCObjectStreamer { unsigned Type, MCSection *&AttributeSection) { createAttributesSection(Vendor, Section, Type, AttributeSection, Contents); } - void - emitAttributesSection(MCSection *&AttributeSection, const Twine &Section, - unsigned Type, - SmallVector &SubSectionVec) { - createAttributesWithSubsection(AttributeSection, Section, Type, - SubSectionVec); - } private: AttributeItem *getAttributeItem(unsigned Attribute); - size_t calculateContentSize(SmallVector &AttrsVec) const; + size_t calculateContentSize(SmallVector &AttrsVec); void createAttributesSection(StringRef Vendor, const Twine &Section, unsigned Type, MCSection *&AttributeSection, SmallVector &AttrsVec); - void createAttributesWithSubsection( - MCSection *&AttributeSection, const Twine &Section, unsigned Type, - SmallVector &SubSectionVec); // GNU attributes that will get emitted at the end of the asm file. 
SmallVector GNUAttributes; diff --git a/llvm/include/llvm/Support/AArch64BuildAttributes.h b/llvm/include/llvm/Support/AArch64BuildAttributes.h deleted file mode 100644 index ea293b72f9bb11..00000000000000 --- a/llvm/include/llvm/Support/AArch64BuildAttributes.h +++ /dev/null @@ -1,75 +0,0 @@ -//===-- AArch64BuildAttributes.h - AARch64 Build Attributes -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains enumerations and support routines for AArch64 build -// attributes as defined in Build Attributes for the AArch64 document. -// -// Build Attributes for the Arm® 64-bit Architecture (AArch64) 2024Q1 -// -// https://github.com/ARM-software/abi-aa/pull/230 -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H -#define LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H - -#include "llvm/ADT/StringRef.h" - -namespace llvm { - -namespace AArch64BuildAttributes { - -/// AArch64 build attributes vendors IDs (a.k.a subsection name) -enum VendorID : unsigned { - AEABI_FEATURE_AND_BITS = 0, - AEABI_PAUTHABI = 1, - VENDOR_UNKNOWN = 404 // Treated as a private subsection name -}; -StringRef getVendorName(unsigned const Vendor); -VendorID getVendorID(StringRef const Vendor); - -enum SubsectionOptional : unsigned { - REQUIRED = 0, - OPTIONAL = 1, - OPTIONAL_NOT_FOUND = 404 -}; -StringRef getOptionalStr(unsigned Optional); -SubsectionOptional getOptionalID(StringRef Optional); -StringRef getSubsectionOptionalUnknownError(); - -enum SubsectionType : unsigned { ULEB128 = 0, NTBS = 1, TYPE_NOT_FOUND = 404 }; -StringRef getTypeStr(unsigned Type); -SubsectionType getTypeID(StringRef Type); -StringRef getSubsectionTypeUnknownError(); - 
-enum PauthABITags : unsigned { - TAG_PAUTH_PLATFORM = 1, - TAG_PAUTH_SCHEMA = 2, - PAUTHABI_TAG_NOT_FOUND = 404 -}; -StringRef getPauthABITagsStr(unsigned PauthABITag); -PauthABITags getPauthABITagsID(StringRef PauthABITag); - -enum FeatureAndBitsTags : unsigned { - TAG_FEATURE_BTI = 0, - TAG_FEATURE_PAC = 1, - TAG_FEATURE_GCS = 2, - FEATURE_AND_BITS_TAG_NOT_FOUND = 404 -}; -StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag); -FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag); - -enum FeatureAndBitsFlag : unsigned { - Feature_BTI_Flag = 1 << 0, - Feature_PAC_Flag = 1 << 1, - Feature_GCS_Flag = 1 << 2 -}; -} // namespace AArch64BuildAttributes -} // namespace llvm - -#endif // LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H \ No newline at end of file diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 282c82198507d7..64ab2b2ab58f5b 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -696,8 +696,8 @@ MCELFStreamer::getAttributeItem(unsigned Attribute) { return nullptr; } -size_t MCELFStreamer::calculateContentSize( - SmallVector &AttrsVec) const { +size_t +MCELFStreamer::calculateContentSize(SmallVector &AttrsVec) { size_t Result = 0; for (const AttributeItem &Item : AttrsVec) { switch (Item.Type) { @@ -783,67 +783,6 @@ void MCELFStreamer::createAttributesSection( AttrsVec.clear(); } -void MCELFStreamer::createAttributesWithSubsection( - MCSection *&AttributeSection, const Twine &Section, unsigned Type, - SmallVector &SubSectionVec) { - // - // [ NTBS: vendor-name - // - // ]* - // vendor-data expends to: - // * - if (0 == SubSectionVec.size()) { - return; - } - - // Switch section to AttributeSection or get/create the section. 
- if (AttributeSection) { - switchSection(AttributeSection); - } else { - AttributeSection = getContext().getELFSection(Section, Type, 0); - switchSection(AttributeSection); - - // Format version - emitInt8(0x41); - } - - for (AttributeSubSection &SubSection : SubSectionVec) { - // subsection-length + vendor-name + '\0' - const size_t VendorHeaderSize = 4 + SubSection.VendorName.size() + 1; - // optional + parameter-type - const size_t VendorParameters = 1 + 1; - const size_t ContentsSize = calculateContentSize(SubSection.Content); - - emitInt32(VendorHeaderSize + VendorParameters + ContentsSize); - emitBytes(SubSection.VendorName); - emitInt8(0); // '\0' - emitInt8(SubSection.IsOptional); - emitInt8(SubSection.ParameterType); - - for (AttributeItem &Item : SubSection.Content) { - emitULEB128IntValue(Item.Tag); - switch (Item.Type) { - default: - assert(0 && "Invalid attribute type"); - break; - case AttributeItem::NumericAttribute: - emitULEB128IntValue(Item.IntValue); - break; - case AttributeItem::TextAttribute: - emitBytes(Item.StringValue); - emitInt8(0); // '\0' - break; - case AttributeItem::NumericAndTextAttributes: - emitULEB128IntValue(Item.IntValue); - emitBytes(Item.StringValue); - emitInt8(0); // '\0' - break; - } - } - } - SubSectionVec.clear(); -} - MCStreamer *llvm::createELFStreamer(MCContext &Context, std::unique_ptr &&MAB, std::unique_ptr &&OW, diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp deleted file mode 100644 index ada34eb3f927d1..00000000000000 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===-- AArch64BuildAttributes.cpp - AArch64 Build Attributes -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/AArch64BuildAttributes.h" -#include "llvm/ADT/StringSwitch.h" - -namespace llvm { -namespace AArch64BuildAttributes { - -StringRef getVendorName(unsigned Vendor) { - switch (Vendor) { - case AEABI_FEATURE_AND_BITS: - return "aeabi_feature_and_bits"; - case AEABI_PAUTHABI: - return "aeabi_pauthabi"; - case VENDOR_UNKNOWN: - return ""; - default: - assert(0 && "Vendor name error"); - return ""; - } -} -VendorID getVendorID(StringRef Vendor) { - return StringSwitch(Vendor) - .Case("aeabi_feature_and_bits", AEABI_FEATURE_AND_BITS) - .Case("aeabi_pauthabi", AEABI_PAUTHABI) - .Default(VENDOR_UNKNOWN); -} - -StringRef getOptionalStr(unsigned Optional) { - switch (Optional) { - case REQUIRED: - return "required"; - case OPTIONAL: - return "optional"; - case OPTIONAL_NOT_FOUND: - default: - return ""; - } -} -SubsectionOptional getOptionalID(StringRef Optional) { - return StringSwitch(Optional) - .Case("required", REQUIRED) - .Case("optional", OPTIONAL) - .Default(OPTIONAL_NOT_FOUND); -} -StringRef getSubsectionOptionalUnknownError() { - return "unknown AArch64 build attributes optionality, expected " - "required|optional"; -} - -StringRef getTypeStr(unsigned Type) { - switch (Type) { - case ULEB128: - return "uleb128"; - case NTBS: - return "ntbs"; - case TYPE_NOT_FOUND: - default: - return ""; - } -} -SubsectionType getTypeID(StringRef Type) { - return StringSwitch(Type) - .Cases("uleb128", "ULEB128", ULEB128) - .Cases("ntbs", "NTBS", NTBS) - .Default(TYPE_NOT_FOUND); -} -StringRef getSubsectionTypeUnknownError() { - return "unknown AArch64 build attributes type, expected uleb128|ntbs"; -} - -StringRef getPauthABITagsStr(unsigned PauthABITag) { - switch (PauthABITag) { - case TAG_PAUTH_PLATFORM: - return "Tag_PAuth_Platform"; - case TAG_PAUTH_SCHEMA: - return "Tag_PAuth_Schema"; - case 
PAUTHABI_TAG_NOT_FOUND: - default: - return ""; - } -} -PauthABITags getPauthABITagsID(StringRef PauthABITag) { - return StringSwitch(PauthABITag) - .Case("Tag_PAuth_Platform", TAG_PAUTH_PLATFORM) - .Case("Tag_PAuth_Schema", TAG_PAUTH_SCHEMA) - .Default(PAUTHABI_TAG_NOT_FOUND); -} - -StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { - switch (FeatureAndBitsTag) { - case TAG_FEATURE_BTI: - return "Tag_Feature_BTI"; - case TAG_FEATURE_PAC: - return "Tag_Feature_PAC"; - case TAG_FEATURE_GCS: - return "Tag_Feature_GCS"; - case FEATURE_AND_BITS_TAG_NOT_FOUND: - default: - return ""; - } -} -FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { - return StringSwitch(FeatureAndBitsTag) - .Case("Tag_Feature_BTI", TAG_FEATURE_BTI) - .Case("Tag_Feature_PAC", TAG_FEATURE_PAC) - .Case("Tag_Feature_GCS", TAG_FEATURE_GCS) - .Default(FEATURE_AND_BITS_TAG_NOT_FOUND); -} -} // namespace AArch64BuildAttributes -} // namespace llvm diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 122240c27b1fcd..2ecaea4b02bf61 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -144,7 +144,6 @@ add_llvm_component_library(LLVMSupport APInt.cpp APSInt.cpp ARMBuildAttrs.cpp - AArch64BuildAttributes.cpp ARMAttributeParser.cpp ARMWinEH.cpp Allocator.cpp diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 8d8520c68232be..27e65d60122fd7 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -208,10 +208,6 @@ class AArch64AsmPrinter : public AsmPrinter { /// pseudo instructions. 
bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst); - // Emit Build Attributes - void emitAttributes(unsigned Flags, uint64_t PAuthABIPlatform, - uint64_t PAuthABIVersion, AArch64TargetStreamer *TS); - void EmitToStreamer(MCStreamer &S, const MCInst &Inst); void EmitToStreamer(const MCInst &Inst) { EmitToStreamer(*OutStreamer, Inst); @@ -349,53 +345,36 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { if (!TT.isOSBinFormatELF()) return; - // For emitting build attributes and .note.gnu.property section - auto *TS = - static_cast(OutStreamer->getTargetStreamer()); - // Assemble feature flags that may require creation of build attributes and a - // note section. - unsigned BAFlags = 0; - unsigned GNUFlags = 0; + // Assemble feature flags that may require creation of a note section. + unsigned Flags = 0; if (const auto *BTE = mdconst::extract_or_null( - M.getModuleFlag("branch-target-enforcement"))) { - if (!BTE->isZero()) { - BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_BTI_Flag; - GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; - } - } + M.getModuleFlag("branch-target-enforcement"))) + if (!BTE->isZero()) + Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; if (const auto *GCS = mdconst::extract_or_null( - M.getModuleFlag("guarded-control-stack"))) { - if (!GCS->isZero()) { - BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_GCS_Flag; - GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_GCS; - } - } + M.getModuleFlag("guarded-control-stack"))) + if (!GCS->isZero()) + Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_GCS; if (const auto *Sign = mdconst::extract_or_null( - M.getModuleFlag("sign-return-address"))) { - if (!Sign->isZero()) { - BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_PAC_Flag; - GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; - } - } + M.getModuleFlag("sign-return-address"))) + if (!Sign->isZero()) + Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; uint64_t PAuthABIPlatform 
= -1; if (const auto *PAP = mdconst::extract_or_null( - M.getModuleFlag("aarch64-elf-pauthabi-platform"))) { + M.getModuleFlag("aarch64-elf-pauthabi-platform"))) PAuthABIPlatform = PAP->getZExtValue(); - } - uint64_t PAuthABIVersion = -1; if (const auto *PAV = mdconst::extract_or_null( - M.getModuleFlag("aarch64-elf-pauthabi-version"))) { + M.getModuleFlag("aarch64-elf-pauthabi-version"))) PAuthABIVersion = PAV->getZExtValue(); - } - // Emit AArch64 Build Attributes - emitAttributes(BAFlags, PAuthABIPlatform, PAuthABIVersion, TS); // Emit a .note.gnu.property section with the flags. - TS->emitNoteSection(GNUFlags, PAuthABIPlatform, PAuthABIVersion); + auto *TS = + static_cast(OutStreamer->getTargetStreamer()); + TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion); } void AArch64AsmPrinter::emitFunctionHeaderComment() { @@ -468,58 +447,6 @@ void AArch64AsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) { recordSled(CurSled, MI, Kind, 2); } -void AArch64AsmPrinter::emitAttributes(unsigned Flags, - uint64_t PAuthABIPlatform, - uint64_t PAuthABIVersion, - AArch64TargetStreamer *TS) { - - PAuthABIPlatform = (uint64_t(-1) == PAuthABIPlatform) ? 0 : PAuthABIPlatform; - PAuthABIVersion = (uint64_t(-1) == PAuthABIVersion) ? 0 : PAuthABIVersion; - - if (PAuthABIPlatform || PAuthABIVersion) { - TS->emitAtributesSubsection( - AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_PAUTHABI), - AArch64BuildAttributes::SubsectionOptional::REQUIRED, - AArch64BuildAttributes::SubsectionType::ULEB128); - TS->emitAttribute(AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_PAUTHABI), - AArch64BuildAttributes::TAG_PAUTH_PLATFORM, - PAuthABIPlatform, "", false); - TS->emitAttribute(AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_PAUTHABI), - AArch64BuildAttributes::TAG_PAUTH_SCHEMA, PAuthABIVersion, - "", false); - } - - unsigned BTIValue = - (Flags & AArch64BuildAttributes::Feature_BTI_Flag) ? 
1 : 0; - unsigned PACValue = - (Flags & AArch64BuildAttributes::Feature_PAC_Flag) ? 1 : 0; - unsigned GCSValue = - (Flags & AArch64BuildAttributes::Feature_GCS_Flag) ? 1 : 0; - - if (BTIValue || PACValue || GCSValue) { - TS->emitAtributesSubsection( - AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), - AArch64BuildAttributes::SubsectionOptional::OPTIONAL, - AArch64BuildAttributes::SubsectionType::ULEB128); - TS->emitAttribute(AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), - AArch64BuildAttributes::TAG_FEATURE_BTI, BTIValue, "", - false); - TS->emitAttribute(AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), - AArch64BuildAttributes::TAG_FEATURE_PAC, PACValue, "", - false); - TS->emitAttribute(AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), - AArch64BuildAttributes::TAG_FEATURE_GCS, GCSValue, "", - false); - } -} - // Emit the following code for Intrinsic::{xray_customevent,xray_typedevent} // (built-in functions __xray_customevent/__xray_typedevent). 
// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d3eda48f3276e9..92f9f7309f8ec0 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -42,7 +42,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/AArch64BuildAttributes.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -228,8 +228,6 @@ class AArch64AsmParser : public MCTargetAsmParser { bool parseDirectiveSEHClearUnwoundToCall(SMLoc L); bool parseDirectiveSEHPACSignLR(SMLoc L); bool parseDirectiveSEHSaveAnyReg(SMLoc L, bool Paired, bool Writeback); - bool parseDirectiveAeabiSubSectionHeader(SMLoc L); - bool parseDirectiveAeabiAArch64Attr(SMLoc L); bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl &Loc); @@ -6994,7 +6992,6 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { const MCContext::Environment Format = getContext().getObjectFileType(); bool IsMachO = Format == MCContext::IsMachO; bool IsCOFF = Format == MCContext::IsCOFF; - bool IsELF = Format == MCContext::IsELF; auto IDVal = DirectiveID.getIdentifier().lower(); SMLoc Loc = DirectiveID.getLoc(); @@ -7090,13 +7087,6 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveSEHSaveAnyReg(Loc, true, true); else return true; - } else if (IsELF) { - if (IDVal == ".aeabi_subsection") - parseDirectiveAeabiSubSectionHeader(Loc); - else if (IDVal == ".aeabi_attribute") - parseDirectiveAeabiAArch64Attr(Loc); - else - return true; } else return true; return false; @@ -7833,265 +7823,6 @@ bool AArch64AsmParser::parseDirectiveSEHSaveAnyReg(SMLoc L, bool Paired, return false; } -bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) { - // Expecting 3 
AsmToken::Identifier after '.aeabi_subsection', a name and 2 - // parameters, e.g.: .aeabi_subsection (1)aeabi_feature_and_bits, (2)optional, - // (3)uleb128 separated by 2 commas. - MCAsmParser &Parser = getParser(); - - // Consume the name (subsection name) - StringRef SubsectionName; - AArch64BuildAttributes::VendorID SubsectionNameID; - if (Parser.getTok().is(AsmToken::Identifier)) { - SubsectionName = Parser.getTok().getIdentifier(); - SubsectionNameID = AArch64BuildAttributes::getVendorID(SubsectionName); - } else { - Error(Parser.getTok().getLoc(), "subsection name not found"); - return true; - } - Parser.Lex(); - // consume a comma - // parseComma() return *false* on success, and call Lex(), no need to call - // Lex() again. - if (Parser.parseComma()) { - return true; - } - - std::unique_ptr SubsectionExists = - getTargetStreamer().getAtributesSubsectionByName(SubsectionName); - - // Consume the first parameter (optionality parameter) - AArch64BuildAttributes::SubsectionOptional IsOptional; - // options: optional/required - if (Parser.getTok().is(AsmToken::Identifier)) { - StringRef Optionality = Parser.getTok().getIdentifier(); - IsOptional = AArch64BuildAttributes::getOptionalID(Optionality); - if (AArch64BuildAttributes::OPTIONAL_NOT_FOUND == IsOptional) { - Error(Parser.getTok().getLoc(), - AArch64BuildAttributes::getSubsectionOptionalUnknownError() + ": " + - Optionality); - return true; - } - if (SubsectionExists) { - if (IsOptional != SubsectionExists->IsOptional) { - Error(Parser.getTok().getLoc(), - "optionality mismatch! 
subsection '" + SubsectionName + - "' already exists with optionality defined as '" + - AArch64BuildAttributes::getOptionalStr( - SubsectionExists->IsOptional) + - "' and not '" + - AArch64BuildAttributes::getOptionalStr(IsOptional) + "'"); - return true; - } - } - } else { - Error(Parser.getTok().getLoc(), - "optionality parameter not found, expected required|optional"); - return true; - } - // Check for possible IsOptional unaccepted values for known subsections - if (AArch64BuildAttributes::AEABI_FEATURE_AND_BITS == SubsectionNameID) { - if (AArch64BuildAttributes::REQUIRED == IsOptional) { - Error(Parser.getTok().getLoc(), - "aeabi_feature_and_bits must be marked as optional"); - return true; - } - } - if (AArch64BuildAttributes::AEABI_PAUTHABI == SubsectionNameID) { - if (AArch64BuildAttributes::OPTIONAL == IsOptional) { - Error(Parser.getTok().getLoc(), - "aeabi_pauthabi must be marked as required"); - return true; - } - } - Parser.Lex(); - // consume a comma - if (Parser.parseComma()) { - return true; - } - - // Consume the second parameter (type parameter) - AArch64BuildAttributes::SubsectionType Type; - if (Parser.getTok().is(AsmToken::Identifier)) { - StringRef Name = Parser.getTok().getIdentifier(); - Type = AArch64BuildAttributes::getTypeID(Name); - if (AArch64BuildAttributes::TYPE_NOT_FOUND == Type) { - Error(Parser.getTok().getLoc(), - AArch64BuildAttributes::getSubsectionTypeUnknownError() + ": " + - Name); - return true; - } - if (SubsectionExists) { - if (Type != SubsectionExists->ParameterType) { - Error(Parser.getTok().getLoc(), - "type mismatch! 
subsection '" + SubsectionName + - "' already exists with type defined as '" + - AArch64BuildAttributes::getTypeStr( - SubsectionExists->ParameterType) + - "' and not '" + AArch64BuildAttributes::getTypeStr(Type) + - "'"); - return true; - } - } - } else { - Error(Parser.getTok().getLoc(), - "type parameter not found, expected uleb128|ntbs"); - return true; - } - // Check for possible unaccepted 'type' values for known subsections - if (AArch64BuildAttributes::AEABI_FEATURE_AND_BITS == SubsectionNameID || - AArch64BuildAttributes::AEABI_PAUTHABI == SubsectionNameID) { - if (AArch64BuildAttributes::NTBS == Type) { - Error(Parser.getTok().getLoc(), - SubsectionName + " must be marked as ULEB128"); - return true; - } - } - Parser.Lex(); - // Parsing finished, check for trailing tokens. - if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) { - Error(Parser.getTok().getLoc(), "unexpected token for AArch64 build " - "attributes subsection header directive"); - return true; - } - - getTargetStreamer().emitAtributesSubsection(SubsectionName, IsOptional, Type); - - return false; -} - -bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) { - // Expecting 2 Tokens: after '.aeabi_attribute', e.g.: - // .aeabi_attribute (1)Tag_Feature_BTI, (2)[uleb128|ntbs] - // separated by a comma. 
- MCAsmParser &Parser = getParser(); - - std::unique_ptr ActiveSubsection = - getTargetStreamer().getActiveAtributesSubsection(); - if (nullptr == ActiveSubsection) { - Error(Parser.getTok().getLoc(), - "no active subsection, build attribute can not be added"); - return true; - } - StringRef ActiveSubsectionName = ActiveSubsection->VendorName; - unsigned ActiveSubsectionType = ActiveSubsection->ParameterType; - - unsigned ActiveSubsectionID = AArch64BuildAttributes::VENDOR_UNKNOWN; - if (AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_PAUTHABI) == ActiveSubsectionName) - ActiveSubsectionID = AArch64BuildAttributes::AEABI_PAUTHABI; - if (AArch64BuildAttributes::getVendorName( - AArch64BuildAttributes::AEABI_FEATURE_AND_BITS) == - ActiveSubsectionName) - ActiveSubsectionID = AArch64BuildAttributes::AEABI_FEATURE_AND_BITS; - - StringRef TagStr = ""; - unsigned Tag; - if (Parser.getTok().is(AsmToken::Identifier)) { - TagStr = Parser.getTok().getIdentifier(); - switch (ActiveSubsectionID) { - default: - assert(0 && "Subsection name error"); - break; - case AArch64BuildAttributes::VENDOR_UNKNOWN: - // Private subsection, accept any tag. 
- break; - case AArch64BuildAttributes::AEABI_PAUTHABI: - Tag = AArch64BuildAttributes::getPauthABITagsID(TagStr); - if (AArch64BuildAttributes::PAUTHABI_TAG_NOT_FOUND == Tag) { - Error(Parser.getTok().getLoc(), "unknown AArch64 build attribute '" + - TagStr + "' for subsection '" + - ActiveSubsectionName + "'"); - return true; - } - break; - case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: - Tag = AArch64BuildAttributes::getFeatureAndBitsTagsID(TagStr); - if (AArch64BuildAttributes::FEATURE_AND_BITS_TAG_NOT_FOUND == Tag) { - Error(Parser.getTok().getLoc(), "unknown AArch64 build attribute '" + - TagStr + "' for subsection '" + - ActiveSubsectionName + "'"); - return true; - } - break; - } - } else if (Parser.getTok().is(AsmToken::Integer)) { - Tag = getTok().getIntVal(); - } else { - Error(Parser.getTok().getLoc(), "AArch64 build attributes tag not found"); - return true; - } - Parser.Lex(); - // consume a comma - // parseComma() return *false* on success, and call Lex(), no need to call - // Lex() again. 
- if (Parser.parseComma()) { - return true; - } - - // Consume the second parameter (attribute value) - unsigned ValueInt = unsigned(-1); - std::string ValueStr = ""; - if (Parser.getTok().is(AsmToken::Integer)) { - if (AArch64BuildAttributes::NTBS == ActiveSubsectionType) { - Error( - Parser.getTok().getLoc(), - "active subsection type is NTBS (string), found ULEB128 (unsigned)"); - return true; - } - ValueInt = getTok().getIntVal(); - } else if (Parser.getTok().is(AsmToken::Identifier)) { - if (AArch64BuildAttributes::ULEB128 == ActiveSubsectionType) { - Error( - Parser.getTok().getLoc(), - "active subsection type is ULEB128 (unsigned), found NTBS (string)"); - return true; - } - ValueStr = Parser.getTok().getIdentifier(); - } else if (Parser.getTok().is(AsmToken::String)) { - if (AArch64BuildAttributes::ULEB128 == ActiveSubsectionType) { - Error( - Parser.getTok().getLoc(), - "active subsection type is ULEB128 (unsigned), found NTBS (string)"); - return true; - } - ValueStr = Parser.getTok().getString(); - } else { - Error(Parser.getTok().getLoc(), "AArch64 build attributes value not found"); - return true; - } - // Check for possible unaccepted values for known tags (AEABI_PAUTHABI, - // AEABI_FEATURE_AND_BITS) - if (!(ActiveSubsectionID == AArch64BuildAttributes::VENDOR_UNKNOWN) && - TagStr != "") { // TagStr was a recognized string - if (0 != ValueInt && 1 != ValueInt) { - Error(Parser.getTok().getLoc(), - "unknown AArch64 build attributes Value for Tag '" + TagStr + - "' options are 0|1"); - return true; - } - } - Parser.Lex(); - // Parsing finished, check for trailing tokens. 
- if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) { - Error(Parser.getTok().getLoc(), - "unexpected token for AArch64 build attributes tag and value " - "attribute directive"); - return true; - } - - if (unsigned(-1) != ValueInt) { - getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, ValueInt, "", - false); - } - - if ("" != ValueStr) { - getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, unsigned(-1), - ValueStr, false); - } - return false; -} - bool AArch64AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // Try @AUTH expressions: they're more complex than the usual symbol variants. if (!parseAuthExpr(Res, EndLoc)) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 9f7a60074daeb9..5bae846824548b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -33,7 +33,6 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" -#include "llvm/Support/AArch64BuildAttributes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -46,7 +45,6 @@ class AArch64ELFStreamer; class AArch64TargetAsmStreamer : public AArch64TargetStreamer { formatted_raw_ostream &OS; - std::string VendorTag; void emitInst(uint32_t Inst) override; @@ -150,137 +148,13 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer { OS << "\t.seh_save_any_reg_px\tq" << Reg << ", " << Offset << "\n"; } - void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, - std::string String, bool Override) override { - - // AArch64 build attributes for assembly attribute form: - // .aeabi_attribute tag, value - if (unsigned(-1) == Value && "" == String) { - assert(0 && "Arguments error"); - return; - } - - unsigned VendorID = 
AArch64BuildAttributes::getVendorID(VendorName); - - switch (VendorID) { - default: - assert(0 && "Subsection name error"); - break; - case AArch64BuildAttributes::VENDOR_UNKNOWN: - if (unsigned(-1) != Value) { - OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; - AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", - Override); - } - if ("" != String) { - OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << String; - AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), - String, Override); - } - break; - // Note: AEABI_FEATURE_AND_BITS takes only unsigned values - case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: - switch (Tag) { - default: // allow emitting any attribute by number - OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; - // Keep the data structure consistent with the case of ELF emission - // (important for llvm-mc asm parsing) - AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", - Override); - break; - case AArch64BuildAttributes::TAG_FEATURE_BTI: - case AArch64BuildAttributes::TAG_FEATURE_GCS: - case AArch64BuildAttributes::TAG_FEATURE_PAC: - OS << "\t.aeabi_attribute" << "\t" - << AArch64BuildAttributes::getFeatureAndBitsTagsStr(Tag) << ", " - << Value; - AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", - Override); - break; - } - break; - // Note: AEABI_PAUTHABI takes only unsigned values - case AArch64BuildAttributes::AEABI_PAUTHABI: - switch (Tag) { - default: // allow emitting any attribute by number - OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; - // Keep the data structure consistent with the case of ELF emission - // (important for llvm-mc asm parsing) - AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", - Override); - break; - case AArch64BuildAttributes::TAG_PAUTH_PLATFORM: - case AArch64BuildAttributes::TAG_PAUTH_SCHEMA: - OS << "\t.aeabi_attribute" << "\t" - << AArch64BuildAttributes::getPauthABITagsStr(Tag) << ", " << Value; - 
AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", - Override); - break; - } - break; - } - OS << "\n"; - } - - void emitAtributesSubsection( - StringRef SubsectionName, - AArch64BuildAttributes::SubsectionOptional Optional, - AArch64BuildAttributes::SubsectionType ParameterType) override { - // The AArch64 build attributes assembly subsection header format: - // ".aeabi_subsection name, optional, parameter type" - // optional: required (0) optional (1) - // parameter type: uleb128 or ULEB128 (0) ntbs or NTBS (1) - unsigned SubsectionID = AArch64BuildAttributes::getVendorID(SubsectionName); - - assert((0 == Optional || 1 == Optional) && - AArch64BuildAttributes::getSubsectionOptionalUnknownError().data()); - assert((0 == ParameterType || 1 == ParameterType) && - AArch64BuildAttributes::getSubsectionTypeUnknownError().data()); - - std::string SubsectionTag = ".aeabi_subsection"; - StringRef OptionalStr = getOptionalStr(Optional); - StringRef ParameterStr = getTypeStr(ParameterType); - - switch (SubsectionID) { - default: { - // Treated as a private subsection - break; - } - case AArch64BuildAttributes::AEABI_PAUTHABI: { - assert(AArch64BuildAttributes::REQUIRED == Optional && - "subsection .aeabi-pauthabi should be marked as " - "required and not as optional"); - assert(AArch64BuildAttributes::ULEB128 == ParameterType && - "subsection .aeabi-pauthabi should be " - "marked as uleb128 and not as ntbs"); - break; - } - case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: { - assert(AArch64BuildAttributes::OPTIONAL == Optional && - "subsection .aeabi_feature_and_bits should be " - "marked as optional and not as required"); - assert(AArch64BuildAttributes::ULEB128 == ParameterType && - "subsection .aeabi_feature_and_bits should " - "be marked as uleb128 and not as ntbs"); - break; - } - } - OS << "\t" << SubsectionTag << "\t" << SubsectionName << ", " << OptionalStr - << ", " << ParameterStr; - // Keep the data structure consistent with the case of ELF 
emission - // (important for llvm-mc asm parsing) - AArch64TargetStreamer::emitAtributesSubsection(SubsectionName, Optional, - ParameterType); - OS << "\n"; - } - public: AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); }; AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) - : AArch64TargetStreamer(S), OS(OS) {} + : AArch64TargetStreamer(S), OS(OS) {} void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n"; @@ -420,23 +294,6 @@ AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { return static_cast(Streamer); } -void AArch64TargetELFStreamer::emitAtributesSubsection( - StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional, - AArch64BuildAttributes::SubsectionType ParameterType) { - AArch64TargetStreamer::emitAtributesSubsection(VendorName, IsOptional, - ParameterType); -} - -void AArch64TargetELFStreamer::emitAttribute(StringRef VendorName, unsigned Tag, - unsigned Value, std::string String, - bool Override) { - if (unsigned(-1) != Value) - AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", Override); - if ("" != String) - AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), String, - Override); -} - void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { getStreamer().emitInst(Inst); } @@ -452,9 +309,6 @@ void AArch64TargetELFStreamer::finish() { MCContext &Ctx = S.getContext(); auto &Asm = S.getAssembler(); - S.emitAttributesSection(AttributeSection, ".ARM.attributes", - ELF::SHT_AARCH64_ATTRIBUTES, AttributeSubSections); - // If ImplicitMapSyms is specified, ensure that text sections end with // the A64 state while non-text sections end with the data state. 
When // sections are combined by the linker, the subsequent section will start with diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 74ffe5f97f1b69..7bd89c9e29a728 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -151,107 +151,3 @@ llvm::createAArch64ObjectTargetStreamer(MCStreamer &S, MCTargetStreamer *llvm::createAArch64NullTargetStreamer(MCStreamer &S) { return new AArch64TargetStreamer(S); } - -void AArch64TargetStreamer::emitAtributesSubsection( - StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional, - AArch64BuildAttributes::SubsectionType ParameterType) { - - // If exists, return. - for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { - if (VendorName == SubSection.VendorName) { - activateAtributesSubsection(VendorName); - return; - } - } - // else, add the subsection - MCELFStreamer::AttributeSubSection AttSubSection; - AttSubSection.VendorName = VendorName; - AttSubSection.IsOptional = IsOptional; - AttSubSection.ParameterType = ParameterType; - AttributeSubSections.push_back(AttSubSection); - activateAtributesSubsection(VendorName); -} - -std::unique_ptr -AArch64TargetStreamer::getActiveAtributesSubsection() { - for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { - if (SubSection.IsActive) { - return std::make_unique(SubSection); - } - } - return nullptr; -} - -std::unique_ptr -AArch64TargetStreamer::getAtributesSubsectionByName(StringRef Name) { - for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { - if (Name == SubSection.VendorName) { - return std::make_unique(SubSection); - } - } - return nullptr; -} - -void AArch64TargetStreamer::emitAttribute(StringRef VendorName, unsigned Tag, - unsigned Value, std::string String, - bool Override) { - - if (unsigned(-1) == 
Value && "" == String) { - assert(0 && "Arguments error"); - return; - } - if (AttributeSubSections.size() == 0) { - assert(0 && - "Can not add AArch64 build attribute: no AArch64 subsection exists"); - return; - } - - for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { - if (VendorName == SubSection.VendorName) { - if (!SubSection.IsActive) { - assert(0 && - "Can not add AArch64 build attribute: subsection is not active"); - return; - } - for (MCELFStreamer::AttributeItem &Item : SubSection.Content) { - if (Item.Tag == Tag) { - if (!Override) { - if ((unsigned(-1) != Value && Item.IntValue != Value) || - ("" != String && Item.StringValue != String)) { - assert(0 && - "Can not add AArch64 build attribute: An attribute with " - "the same tag and a different value already exists"); - return; - } else { - // Case Item.IntValue == Value, no need to emit twice - assert(0 && - "AArch64 build attribute: An attribute with the same tag " - "and a same value already exists"); - return; - } - } - } - } - if (unsigned(-1) != Value) - SubSection.Content.push_back(MCELFStreamer::AttributeItem( - MCELFStreamer::AttributeItem::NumericAttribute, Tag, Value, "")); - if ("" != String) - SubSection.Content.push_back(MCELFStreamer::AttributeItem( - MCELFStreamer::AttributeItem::TextAttribute, Tag, unsigned(-1), - String)); - return; - } - } - assert(0 && "Can not add AArch64 build attribute: required subsection does " - "not exist"); -} - -void AArch64TargetStreamer::activateAtributesSubsection(StringRef VendorName) { - for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { - if (VendorName == SubSection.VendorName) { - SubSection.IsActive = true; - } else { - SubSection.IsActive = false; - } - } -} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index b2b9afe8670738..1c0f5d848c00c6 100644 --- 
a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -10,12 +10,7 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H #include "AArch64MCExpr.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/Instructions.h" -#include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/Support/AArch64BuildAttributes.h" -#include namespace { class AArch64ELFStreamer; @@ -94,24 +89,6 @@ class AArch64TargetStreamer : public MCTargetStreamer { virtual void emitARM64WinCFISaveAnyRegQX(unsigned Reg, int Offset) {} virtual void emitARM64WinCFISaveAnyRegQPX(unsigned Reg, int Offset) {} - /// Build attributes implementation - virtual void - emitAtributesSubsection(StringRef VendorName, - AArch64BuildAttributes::SubsectionOptional IsOptional, - AArch64BuildAttributes::SubsectionType ParameterType); - virtual void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, - std::string String, bool Override); - void activateAtributesSubsection(StringRef VendorName); - std::unique_ptr - getActiveAtributesSubsection(); - std::unique_ptr - getAtributesSubsectionByName(StringRef Name); - void - insertAttributeInPlace(const MCELFStreamer::AttributeItem &Attr, - MCELFStreamer::AttributeSubSection &AttSubSection); - - SmallVector AttributeSubSections; - private: std::unique_ptr ConstantPools; }; @@ -120,15 +97,6 @@ class AArch64TargetELFStreamer : public AArch64TargetStreamer { private: AArch64ELFStreamer &getStreamer(); - MCSection *AttributeSection = nullptr; - - /// Build attributes implementation - void emitAtributesSubsection( - StringRef VendorName, - AArch64BuildAttributes::SubsectionOptional IsOptional, - AArch64BuildAttributes::SubsectionType ParameterType) override; - void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, - std::string String, bool Override = false) override; void emitInst(uint32_t Inst) override; void 
emitDirectiveVariantPCS(MCSymbol *Symbol) override; void finish() override; diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll deleted file mode 100644 index 81ece7aec8793b..00000000000000 --- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc %s -o - | FileCheck %s --check-prefix=ASM -; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -; ASM: .text -; ASM-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 -; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1 -; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1 - -; ELF: Hex dump of section '.ARM.attributes': -; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... -; ELF-NEXT: 0x00000020 01010201 - - -target triple = "aarch64-unknown-none-elf" - -!llvm.module.flags = !{!1, !2, !3} - -!1 = !{i32 8, !"branch-target-enforcement", i32 1} -!2 = !{i32 8, !"guarded-control-stack", i32 1} -!3 = !{i32 8, !"sign-return-address", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll deleted file mode 100644 index e719e06553cc0f..00000000000000 --- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s | FileCheck %s --check-prefix=ASM -; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -; ASM: .text -; ASM-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 -; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0 -; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0 - -; ELF: Hex dump of section '.ARM.attributes': -; ELF-NEXT: 0x00000000 
41230000 00616561 62695f66 65617475 A#...aeabi_featu -; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... -; ELF-NEXT: 0x00000020 01000200 - - -target triple = "aarch64-unknown-none-elf" - -!llvm.module.flags = !{!1} - -!1 = !{i32 8, !"branch-target-enforcement", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll deleted file mode 100644 index 6f231025a11e33..00000000000000 --- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s | FileCheck %s --check-prefix=ASM -; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -; ASM: .text -; ASM-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0 -; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0 -; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1 - -; ELF: Hex dump of section '.ARM.attributes': -; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
-; ELF-NEXT: 0x00000020 01000201 - - -target triple = "aarch64-unknown-none-elf" - -!llvm.module.flags = !{!1} - -!1 = !{i32 8, !"guarded-control-stack", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll deleted file mode 100644 index 54ff12655eb23f..00000000000000 --- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s | FileCheck %s --check-prefix=ASM -; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -; ASM: .text -; ASM-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0 -; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1 -; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0 - -; ELF: Hex dump of section '.ARM.attributes': -; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
-; ELF-NEXT: 0x00000020 01010200 - - -target triple = "aarch64-unknown-none-elf" - -!llvm.module.flags = !{!1} - -!1 = !{i32 8, !"sign-return-address", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll deleted file mode 100644 index 7e41167e8fff57..00000000000000 --- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s | FileCheck %s --check-prefix=ASM -; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -; ASM: .text -; ASM-NEXT: .aeabi_subsection aeabi_pauthabi, required, uleb128 -; ASM-NEXT: .aeabi_attribute Tag_PAuth_Platform, 2 -; ASM-NEXT: .aeabi_attribute Tag_PAuth_Schema, 31 - -; ELF: Hex dump of section '.ARM.attributes': -; ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth -; ELF-NEXT: 0x00000010 61626900 00000102 021f - - -target triple = "aarch64-unknown-none-elf" - -!llvm.module.flags = !{!1, !2} - -!1 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2} -!2 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31} diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s deleted file mode 100644 index a895821f3b05d1..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .text -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 -// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 1 -// ASM: .aeabi_attribute Tag_Feature_PAC, 1 
-// ASM: .aeabi_attribute Tag_Feature_GCS, 1 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth -// ELF-NEXT: 0x00000010 61626900 00000101 02012300 00006165 abi.......#...ae -// ELF-NEXT: 0x00000020 6162695f 66656174 7572655f 616e645f abi_feature_and_ -// ELF-NEXT: 0x00000030 62697473 00010000 01010102 01 - - -.text -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_PAuth_Platform, 1 -.aeabi_attribute Tag_PAuth_Schema, 1 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 1 -.aeabi_attribute Tag_Feature_PAC, 1 -.aeabi_attribute Tag_Feature_GCS, 1 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s deleted file mode 100644 index 25573a0cabeca6..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 1 -// ASM: .aeabi_attribute Tag_Feature_PAC, 0 -// ASM: .aeabi_attribute Tag_Feature_GCS, 0 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... 
-// ELF-NEXT: 0x00000020 01000200 - - -.text -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 1 -.aeabi_attribute Tag_Feature_PAC, 0 -.aeabi_attribute Tag_Feature_GCS, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s deleted file mode 100644 index e8daec0525591f..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s +++ /dev/null @@ -1,70 +0,0 @@ -// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s - -.aeabi_attribute Tag_Feature_BTI, 1 -// ERR: error: no active subsection, build attribute can not be added -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 - -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_Feature_BTI, 1 -// ERR: error: unknown AArch64 build attribute 'Tag_Feature_BTI' for subsection 'aeabi_pauthabi' -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 - -.aeabi_attribute Tag_PAuth_Platform, 4 -// ERR: error: unknown AArch64 build attributes Value for Tag 'Tag_PAuth_Platform' options are 0|1 -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, 4 - -.aeabi_attribute a, 1 -// ERR: error: unknown AArch64 build attribute 'a' for subsection 'aeabi_pauthabi' -// ERR-NEXT: .aeabi_attribute a, 1 - -.aeabi_attribute Tag_PAuth_Platform, Tag_PAuth_Platform -// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, Tag_PAuth_Platform - -.aeabi_attribute Tag_PAuth_Platform, a -// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, a - -.aeabi_attribute Tag_PAuth_Platform, -// ERR: error: AArch64 build attributes value not found -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, - -.aeabi_attribute Tag_PAuth_Platform -// ERR: error: expected comma -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform - 
-.aeabi_attribute -// ERR: error: AArch64 build attributes tag not found -// ERR-NEXT: .aeabi_attribute - -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_PAuth_Platform, 1 -// ERR: unknown AArch64 build attribute 'Tag_PAuth_Platform' for subsection 'aeabi_feature_and_bits' - -.aeabi_attribute a, 1 -// ERR: error: unknown AArch64 build attribute 'a' for subsection 'aeabi_feature_and_bits' - -.aeabi_attribute Tag_Feature_BTI, Tag_Feature_BTI -// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, Tag_Feature_BTI - -.aeabi_attribute Tag_Feature_BTI, a -// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, a - -.aeabi_attribute Tag_Feature_BTI, -// ERR: error: AArch64 build attributes value not found -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, - -.aeabi_attribute Tag_Feature_BTI -// ERR: error: expected comma -// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI - -.aeabi_attribute -// ERR: error: AArch64 build attributes tag not found -// ERR-NEXT: .aeabi_attribute - -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_PAuth_Platform, 1 some_text -// ERR: error: unexpected token for AArch64 build attributes tag and value attribute directive -// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, 1 some_text \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s deleted file mode 100644 index 9e6dca341e9f86..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s - -.aeabi_subsection aeabi_pauthabi, optional, uleb128 -// ERR: error: aeabi_pauthabi must be marked as required -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, 
optional, uleb128 - -.aeabi_subsection aeabi_pauthabi, required, ntbs -// ERR: error: aeabi_pauthabi must be marked as ULEB128 -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, ntbs - -.aeabi_subsection aeabi_feature_and_bits, required, uleb128 -// ERR: error: aeabi_feature_and_bits must be marked as optional -// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, required, uleb128 - -.aeabi_subsection aeabi_feature_and_bits, optional, ntbs -// ERR: error: aeabi_feature_and_bits must be marked as ULEB128 -// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, ntbs - -.aeabi_subsection 1, required, uleb128 -// ERR: error: subsection name not found -// ERR-NEXT: .aeabi_subsection 1, required, uleb128 - -.aeabi_subsection , required, uleb128 -// ERR: error: subsection name not found -// ERR-NEXT: .aeabi_subsection , required, uleb128 - -.aeabi_subsection aeabi_pauthabi, a, uleb128 -// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128 - -.aeabi_subsection aeabi_pauthabi, a, uleb128 -// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128 - -.aeabi_subsection aeabi_pauthabi, 1, uleb128 -// ERR: error: optionality parameter not found, expected required|optional -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, 1, uleb128 - -.aeabi_subsection aeabi_pauthabi, ,uleb128 -// ERR: error: optionality parameter not found, expected required|optional -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, ,uleb128 - -.aeabi_subsection aeabi_pauthabi,uleb128 -// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: uleb128 -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi,uleb128 - -.aeabi_subsection aeabi_pauthabi uleb128 -// ERR: expected comma -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi uleb128 - -.aeabi_subsection aeabi_pauthabi, required -// ERR: 
error: expected comma -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required - -.aeabi_subsection aeabi_pauthabi, required, -// ERR: error: type parameter not found, expected uleb128|ntbs -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, - -.aeabi_subsection aeabi_pauthabi, required, a -// ERR: error: unknown AArch64 build attributes type, expected uleb128|ntbs: a -// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, a diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s deleted file mode 100644 index 62789c514dc332..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 0 -// ASM: .aeabi_attribute Tag_Feature_PAC, 0 -// ASM: .aeabi_attribute Tag_Feature_GCS, 1 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
-// ELF-NEXT: 0x00000020 01000201 - - -.text -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 0 -.aeabi_attribute Tag_Feature_PAC, 0 -.aeabi_attribute Tag_Feature_GCS, 1 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s deleted file mode 100644 index 07c89670373de8..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .text -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_attribute Tag_PAuth_Platform, 0 -// ASM: .aeabi_attribute Tag_PAuth_Schema, 0 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 0 -// ASM: .aeabi_attribute Tag_Feature_PAC, 0 -// ASM: .aeabi_attribute Tag_Feature_GCS, 0 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth -// ELF-NEXT: 0x00000010 61626900 00000100 02002300 00006165 abi.......#...ae -// ELF-NEXT: 0x00000020 6162695f 66656174 7572655f 616e645f abi_feature_and_ -// ELF-NEXT: 0x00000030 62697473 00010000 00010002 00 - - -.text -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_PAuth_Platform, 0 -.aeabi_attribute Tag_PAuth_Schema, 0 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 0 -.aeabi_attribute Tag_Feature_PAC, 0 -.aeabi_attribute Tag_Feature_GCS, 0 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s deleted file mode 100644 index 
2cdae778df5de5..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s +++ /dev/null @@ -1,41 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM - -// ASM: .text -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_attribute 0, 1 -// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 -// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 -// ASM: .aeabi_attribute 3, 1 -// ASM: .aeabi_attribute 4, 1 -// ASM: .aeabi_attribute 5, 1 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 1 -// ASM: .aeabi_attribute Tag_Feature_PAC, 1 -// ASM: .aeabi_attribute Tag_Feature_GCS, 1 -// ASM: .aeabi_attribute 3, 1 -// ASM: .aeabi_attribute 4, 1 -// ASM: .aeabi_attribute 5, 1 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41210000 00616561 62695f70 61757468 A!...aeabi_pauth -// ELF-NEXT: 0x00000010 61626900 00000001 01010201 03010401 abi............. -// ELF-NEXT: 0x00000020 05012900 00006165 6162695f 66656174 ..)...aeabi_feat -// ELF-NEXT: 0x00000030 7572655f 616e645f 62697473 00010000 ure_and_bits.... 
-// ELF-NEXT: 0x00000040 01010102 01030104 010501 - - -.text -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute 0, 1 -.aeabi_attribute 1, 1 -.aeabi_attribute 2, 1 -.aeabi_attribute 3, 1 -.aeabi_attribute 4, 1 -.aeabi_attribute 5, 1 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute 0, 1 -.aeabi_attribute 1, 1 -.aeabi_attribute 2, 1 -.aeabi_attribute 3, 1 -.aeabi_attribute 4, 1 -.aeabi_attribute 5, 1 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s deleted file mode 100644 index 08ea2173ab86ce..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s +++ /dev/null @@ -1,50 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .text -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 1 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_GCS, 1 -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_PAC, 0 -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute 7, 1 -// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 -// ASM: 
.aeabi_attribute 7, 0 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 411b0000 00616561 62695f70 61757468 A....aeabi_pauth -// ELF-NEXT: 0x00000010 61626900 00000201 01010700 25000000 abi.........%... -// ELF-NEXT: 0x00000020 61656162 695f6665 61747572 655f616e aeabi_feature_an -// ELF-NEXT: 0x00000030 645f6269 74730001 00000102 01010007 d_bits.......... -// ELF-NEXT: 0x00000040 01 - - -.text -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 1 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_PAuth_Schema, 1 -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute Tag_PAuth_Platform, 1 -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_GCS, 1 -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_PAC, 0 -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute 7, 1 -.aeabi_subsection aeabi_pauthabi, required, uleb128 -.aeabi_attribute 7, 0 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s deleted file mode 100644 index 483cae0e09cc7c..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -// ASM: .aeabi_attribute Tag_Feature_BTI, 0 -// ASM: .aeabi_attribute Tag_Feature_PAC, 1 -// ASM: .aeabi_attribute 
Tag_Feature_GCS, 0 - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu -// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... -// ELF-NEXT: 0x00000020 01010200 - - -.text -.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 -.aeabi_attribute Tag_Feature_BTI, 0 -.aeabi_attribute Tag_Feature_PAC, 1 -.aeabi_attribute Tag_Feature_GCS, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s deleted file mode 100644 index 2b4cbcc721acd4..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s - -.aeabi_subsection private_subsection, optional, uleb128 - -.aeabi_subsection private_subsection, required, uleb128 -// ERR: error: optionality mismatch! subsection 'private_subsection' already exists with optionality defined as 'optional' and not 'required' -// ERR-NEXT: .aeabi_subsection private_subsection, required, uleb128 - -.aeabi_subsection private_subsection, optional, ntbs -// ERR: error: type mismatch! subsection 'private_subsection' already exists with type defined as 'uleb128' and not 'ntbs' -// ERR-NEXT: .aeabi_subsection private_subsection, optional, ntbs - -.aeabi_subsection private_subsection_1, optional, ntbs -.aeabi_attribute 324, 1 -// ERR: error: active subsection type is NTBS (string), found ULEB128 (unsigned) -// ERR-NEXT: .aeabi_attribute 324, 1 - -.aeabi_subsection foo, optional, uleb128 -.aeabi_subsection bar, optional, uleb128 -.aeabi_subsection foo, required, uleb128 -// ERR: error: optionality mismatch! 
subsection 'foo' already exists with optionality defined as 'optional' and not 'required' -// ERR-NEXT: .aeabi_subsection foo, required, uleb128 - -.aeabi_subsection goo, optional, ntbs -.aeabi_subsection zar, optional, ntbs -.aeabi_subsection goo, optional, uleb128 -// ERR: error: type mismatch! subsection 'goo' already exists with type defined as 'ntbs' and not 'uleb128' -// ERR-NEXT: .aeabi_subsection goo, optional, uleb128 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s deleted file mode 100644 index 229033a9f6b70d..00000000000000 --- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s +++ /dev/null @@ -1,51 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF - -// ASM: .aeabi_subsection private_subsection_1, optional, uleb128 -// ASM: .aeabi_attribute 12, 257 -// ASM: .aeabi_subsection private_subsection_2, required, uleb128 -// ASM: .aeabi_attribute 76, 257 -// ASM: .aeabi_subsection private_subsection_3, optional, ntbs -// ASM: .aeabi_attribute 34, hello_llvm -// ASM: .aeabi_subsection private_subsection_4, required, ntbs -// ASM: .aeabi_attribute 777, "hello_llvm" -// ASM: .aeabi_subsection private_subsection_1, optional, uleb128 -// ASM: .aeabi_attribute 876, 257 -// ASM: .aeabi_subsection private_subsection_2, required, uleb128 -// ASM: .aeabi_attribute 876, 257 -// ASM: .aeabi_subsection private_subsection_3, optional, ntbs -// ASM: .aeabi_attribute 876, "hello_llvm" -// ASM: .aeabi_subsection private_subsection_4, required, ntbs -// ASM: .aeabi_attribute 876, hello_llvm - -// ELF: Hex dump of section '.ARM.attributes': -// ELF-NEXT: 0x00000000 41220000 00707269 76617465 5f737562 A"...private_sub -// ELF-NEXT: 0x00000010 73656374 
696f6e5f 31000100 0c8102ec section_1....... -// ELF-NEXT: 0x00000020 06810222 00000070 72697661 74655f73 ..."...private_s -// ELF-NEXT: 0x00000030 75627365 6374696f 6e5f3200 00004c81 ubsection_2...L. -// ELF-NEXT: 0x00000040 02ec0681 02360000 00707269 76617465 .....6...private -// ELF-NEXT: 0x00000050 5f737562 73656374 696f6e5f 33000101 _subsection_3... -// ELF-NEXT: 0x00000060 2268656c 6c6f5f6c 6c766d00 ec062268 "hello_llvm..."h -// ELF-NEXT: 0x00000070 656c6c6f 5f6c6c76 6d220037 00000070 ello_llvm".7...p -// ELF-NEXT: 0x00000080 72697661 74655f73 75627365 6374696f rivate_subsectio -// ELF-NEXT: 0x00000090 6e5f3400 00018906 2268656c 6c6f5f6c n_4....."hello_l -// ELF-NEXT: 0x000000a0 6c766d22 00ec0668 656c6c6f 5f6c6c76 lvm"...hello_llv -// ELF-NEXT: 0x000000b0 6d00 m. - - -.aeabi_subsection private_subsection_1, optional, uleb128 -.aeabi_attribute 12, 257 -.aeabi_subsection private_subsection_2, required, uleb128 -.aeabi_attribute 76, 257 -.aeabi_subsection private_subsection_3, optional, ntbs -.aeabi_attribute 34, hello_llvm -.aeabi_subsection private_subsection_4, required, ntbs -.aeabi_attribute 777, "hello_llvm" -.aeabi_subsection private_subsection_1, optional, uleb128 -.aeabi_attribute 876, 257 -.aeabi_subsection private_subsection_2, required, uleb128 -.aeabi_attribute 876, 257 -.aeabi_subsection private_subsection_3, optional, ntbs -.aeabi_attribute 876, "hello_llvm" -.aeabi_subsection private_subsection_4, required, ntbs -.aeabi_attribute 876, hello_llvm diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 5146d4141f29b6..d152aec19d1b58 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -33,7 +33,6 @@ static_library("Support") { "Windows", ] sources = [ - "AArch64BuildAttributes.cpp", "ABIBreak.cpp", "AMDGPUMetadata.cpp", "APFixedPoint.cpp", From 195a1fc5b05d7a42b2e3fa383edb9a7e8b34a9c5 Mon Sep 17 00:00:00 2001 From: 
Krystian Stasiowski Date: Wed, 22 Jan 2025 13:13:40 -0500 Subject: [PATCH 024/208] Reapply "[Clang][Sema] Use the correct lookup context when building overloaded 'operator->' in the current instantiation (#104458)" (#109422) Reapplies #104458, fixing a bug that occurs when a class member access expression calls an `operator->` operator function that returns a non-dependent class type. --- clang/include/clang/Sema/Sema.h | 5 +-- clang/lib/Sema/SemaExprCXX.cpp | 21 ++------- clang/lib/Sema/SemaExprMember.cpp | 2 +- clang/lib/Sema/SemaOverload.cpp | 20 ++++++--- clang/lib/Sema/TreeTransform.h | 5 ++- .../temp.res/temp.dep/temp.dep.type/p4.cpp | 45 ++++++++++++++----- 6 files changed, 59 insertions(+), 39 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 9fa33d6ca76ba5..a2a47d535b8e06 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10608,9 +10608,8 @@ class Sema final : public SemaBase { /// BuildOverloadedArrowExpr - Build a call to an overloaded @c operator-> /// (if one exists), where @c Base is an expression of class type and /// @c Member is the name of the member we're trying to find. 
- ExprResult BuildOverloadedArrowExpr(Scope *S, Expr *Base, - SourceLocation OpLoc, - bool *NoArrowOperatorFound = nullptr); + ExprResult BuildOverloadedArrowExpr(Expr *Base, SourceLocation OpLoc, + bool *NoArrowOperatorFound); ExprResult BuildCXXMemberCallExpr(Expr *Exp, NamedDecl *FoundDecl, CXXConversionDecl *Method, diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 1e39d69e8b230f..0ebf5f54613926 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -7999,18 +7999,6 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, QualType BaseType = Base->getType(); MayBePseudoDestructor = false; - if (BaseType->isDependentType()) { - // If we have a pointer to a dependent type and are using the -> operator, - // the object type is the type that the pointer points to. We might still - // have enough information about that type to do something useful. - if (OpKind == tok::arrow) - if (const PointerType *Ptr = BaseType->getAs()) - BaseType = Ptr->getPointeeType(); - - ObjectType = ParsedType::make(BaseType); - MayBePseudoDestructor = true; - return Base; - } // C++ [over.match.oper]p8: // [...] 
When operator->returns, the operator-> is applied to the value @@ -8025,7 +8013,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, SmallVector OperatorArrows; CTypes.insert(Context.getCanonicalType(BaseType)); - while (BaseType->isRecordType()) { + while (BaseType->getAsRecordDecl()) { if (OperatorArrows.size() >= getLangOpts().ArrowDepth) { Diag(OpLoc, diag::err_operator_arrow_depth_exceeded) << StartingType << getLangOpts().ArrowDepth << Base->getSourceRange(); @@ -8036,7 +8024,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, } Result = BuildOverloadedArrowExpr( - S, Base, OpLoc, + Base, OpLoc, // When in a template specialization and on the first loop iteration, // potentially give the default diagnostic (with the fixit in a // separate note) instead of having the error reported back to here @@ -8100,7 +8088,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, // it's legal for the type to be incomplete if this is a pseudo-destructor // call. We'll do more incomplete-type checks later in the lookup process, // so just skip this check for ObjC types. - if (!BaseType->isRecordType()) { + if (BaseType->isDependentType() || !BaseType->isRecordType()) { ObjectType = ParsedType::make(BaseType); MayBePseudoDestructor = true; return Base; @@ -8111,8 +8099,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, // Unlike the object expression in other contexts, *this is not required to // be of complete type for purposes of class member access (5.2.5) outside // the member function body. 
- if (!BaseType->isDependentType() && - !isThisOutsideMemberFunctionBody(BaseType) && + if (!isThisOutsideMemberFunctionBody(BaseType) && RequireCompleteType(OpLoc, BaseType, diag::err_incomplete_member_access)) { return CreateRecoveryExpr(Base->getBeginLoc(), Base->getEndLoc(), {Base}); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index d130e8b86bc56d..8326a4db0a7719 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -1357,7 +1357,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R, BaseType = Ptr->getPointeeType(); else if (BaseType->isFunctionType()) goto fail; - else if (BaseType->isDependentType()) + else if (BaseExpr.get()->isTypeDependent()) BaseType = S.Context.DependentTy; else if (BaseType->isRecordType()) { // Recover from arrow accesses to records, e.g.: diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 7e8811b5274efb..bf4c0288274ac7 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -15962,10 +15962,9 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, return CheckForImmediateInvocation(MaybeBindToTemporary(TheCall), Method); } -ExprResult -Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, SourceLocation OpLoc, - bool *NoArrowOperatorFound) { - assert(Base->getType()->isRecordType() && +ExprResult Sema::BuildOverloadedArrowExpr(Expr *Base, SourceLocation OpLoc, + bool *NoArrowOperatorFound) { + assert(Base->getType()->getAsRecordDecl() && "left-hand side must have class type"); if (checkPlaceholderForOverload(*this, Base)) @@ -15988,9 +15987,20 @@ Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, SourceLocation OpLoc, return ExprError(); LookupResult R(*this, OpName, OpLoc, LookupOrdinaryName); - LookupQualifiedName(R, Base->getType()->castAs()->getDecl()); + LookupParsedName(R, /*S=*/nullptr, /*SS=*/nullptr, Base->getType()); R.suppressAccessDiagnostics(); + if 
(Base->getType()->isDependentType() && + (!R.empty() || R.wasNotFoundInCurrentInstantiation())) { + DeclarationNameInfo OpNameInfo(OpName, OpLoc); + ExprResult Fn = CreateUnresolvedLookupExpr( + /*NamingClass=*/nullptr, /*NNSLoc=*/NestedNameSpecifierLoc(), + OpNameInfo, R.asUnresolvedSet(), /*PerformADL=*/false); + return CXXOperatorCallExpr::Create(Context, OO_Arrow, Fn.get(), Base, + Context.DependentTy, VK_PRValue, OpLoc, + CurFPFeatureOverrides()); + } + for (LookupResult::iterator Oper = R.begin(), OperEnd = R.end(); Oper != OperEnd; ++Oper) { AddMethodCandidate(Oper.getPair(), Base->getType(), Base->Classify(Context), diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 7dc88a1ae23b98..c0363692a3eb72 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -17282,10 +17282,11 @@ ExprResult TreeTransform::RebuildCXXOperatorCallExpr( } else if (Op == OO_Arrow) { // It is possible that the type refers to a RecoveryExpr created earlier // in the tree transformation. - if (First->getType()->isDependentType()) + if (First->containsErrors()) return ExprError(); // -> is never a builtin operation. 
- return SemaRef.BuildOverloadedArrowExpr(nullptr, First, OpLoc); + return getSema().BuildOverloadedArrowExpr(First, OpLoc, + /*NoArrowOperatorFound=*/nullptr); } else if (Second == nullptr || isPostIncDec) { if (!First->getType()->isOverloadableType() || (Op == OO_Amp && getSema().isQualifiedMemberAccess(First))) { diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp index f32f49ef4539a5..03eda1f13feed7 100644 --- a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp +++ b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp @@ -484,16 +484,19 @@ namespace N4 { template struct A { void not_instantiated(A a, A b, T c) { - a->x; - b->x; + a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} c->x; } void instantiated(A a, A b, T c) { - a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-1 {{no member named 'x' in 'N4::A'}} - b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-1 {{no member named 'x' in 'N4::A'}} + // FIXME: We should only emit a single diagnostic suggesting to use '.'! 
+ a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-1 {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-2 {{no member named 'x' in 'N4::A'}} + b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-1 {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-2 {{no member named 'x' in 'N4::A'}} c->x; // expected-error {{member reference type 'int' is not a pointer}} } }; @@ -540,11 +543,10 @@ namespace N4 { a->T::f(); a->T::g(); - // FIXME: 'U' should be a dependent name, and its lookup context should be 'a.operator->()'! - a->U::x; // expected-error {{use of undeclared identifier 'U'}} - a->U::y; // expected-error {{use of undeclared identifier 'U'}} - a->U::f(); // expected-error {{use of undeclared identifier 'U'}} - a->U::g(); // expected-error {{use of undeclared identifier 'U'}} + a->U::x; + a->U::y; + a->U::f(); + a->U::g(); } void instantiated(D a) { @@ -605,3 +607,24 @@ namespace N5 { template void g(int); // expected-note {{in instantiation of}} } // namespace N5 + +namespace N6 { + struct A { + int x; + }; + + struct B { + A* operator->(); + }; + + struct C { + B y; + }; + + template + struct D : C { + void f() { + y->x; + } + }; +} // namespace N6 From 137d706739653304294adef84ed758e3e498d975 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Wed, 22 Jan 2025 18:36:37 +0000 Subject: [PATCH 025/208] [SCEV] Do not attempt to collect loop guards for loops without predecessor. (#123662) Attempting to collect loop guards for loops without a predecessor can lead to non-terminating recursion trying to construct a SCEV. Fixes https://github.com/llvm/llvm-project/issues/122913. 
--- llvm/lib/Analysis/ScalarEvolution.cpp | 2 ++ ...t-guard-info-with-multiple-predecessors.ll | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7673c354817579..210c7cab965edb 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15328,6 +15328,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { BasicBlock *Header = L->getHeader(); BasicBlock *Pred = L->getLoopPredecessor(); LoopGuards Guards(SE); + if (!Pred) + return Guards; SmallPtrSet VisitedBlocks; collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks); return Guards; diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll index 46dccf454f21ac..28035b05303db3 100644 --- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll @@ -336,3 +336,31 @@ exit: ret void } + +; Checks correct traversal for loops without a unique predecessor +; outside the loop. 
+define void @pr122913() { +; CHECK-LABEL: pr122913 +; CHECK-NEXT: Determining loop execution counts for: @pr122913 +; CHECK-NEXT: Loop %header: backedge-taken count is i1 false +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i1 false +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is i1 false +; CHECK-NEXT: Loop %header: Trip multiple is 1 +entry: + br i1 1, label %bb, label %header + +bb: + br i1 1, label %exit, label %header + +header: + %0 = phi i32 [ %1, %body ], [ 0, %bb ], [ 0, %entry ] + br label %body + +body: + %1 = add i32 %0, 1 + %2 = icmp ult i32 %1, 0 + br i1 %2, label %header, label %exit + +exit: + ret void +} From 146ee98caa9ab1f717216b08cfe72bd1ab2e0b8b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Jan 2025 10:37:01 -0800 Subject: [PATCH 026/208] [RISCV] Remove duplicate WriteRes; // Handle CTI Pipeline. def : WriteRes; -def : WriteRes; -let Latency = 2 in { def : WriteRes; def : WriteRes; -} // Handle FPU Pipelines. def p8700FPQ : ProcResource<3> { let BufferSize = 16; } From ec15b242505a46ea7d195a6520fb869a80a2cd10 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 22 Jan 2025 10:37:56 -0800 Subject: [PATCH 027/208] [llvm][Support] Only enable backtrace test when it's enabled (#123852) rdar://138554797 --- llvm/test/CMakeLists.txt | 1 + llvm/test/Other/crash-stack-trace.ll | 2 +- llvm/test/lit.cfg.py | 3 +++ llvm/test/lit.site.cfg.py.in | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index c66075434f1583..231dcfa35a34c3 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -1,6 +1,7 @@ llvm_canonicalize_cmake_booleans( BUILD_SHARED_LIBS HAVE_OCAMLOPT + ENABLE_BACKTRACES LLVM_ENABLE_DIA_SDK LLVM_ENABLE_FFI LLVM_ENABLE_THREADS diff --git a/llvm/test/Other/crash-stack-trace.ll b/llvm/test/Other/crash-stack-trace.ll index 29e43fe8197c28..104933dffa0a94 100644 --- a/llvm/test/Other/crash-stack-trace.ll +++ 
b/llvm/test/Other/crash-stack-trace.ll @@ -1,4 +1,4 @@ -; REQUIRES: asserts +; REQUIRES: asserts, backtrace ; RUN: not --crash opt -passes=trigger-crash-module %s -disable-output 2>&1 | \ ; RUN: FileCheck %s --check-prefix=CHECK-MODULE diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 5a03a85386e0aa..b17d41fa11af7c 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -597,6 +597,9 @@ def have_ld64_plugin_support(): ) and not re.match(r"^arm64(e)?-apple-(macos|darwin)", config.target_triple): config.available_features.add("debug_frame") +if config.enable_backtrace: + config.available_features.add("backtrace") + if config.enable_threads: config.available_features.add("thread_support") diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 0968f6214772d0..0d02920323d2dd 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -41,6 +41,7 @@ config.have_httplib = @LLVM_ENABLE_HTTPLIB@ config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@ config.enable_ffi = @LLVM_ENABLE_FFI@ config.build_examples = @LLVM_BUILD_EXAMPLES@ +config.enable_backtrace = @ENABLE_BACKTRACES@ config.enable_threads = @LLVM_ENABLE_THREADS@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ From 589593254eede2f624f29390dc1018725e536505 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 22 Jan 2025 10:42:15 -0800 Subject: [PATCH 028/208] [RISCV][VLOpt] Reorganize visit order and worklist management (#123973) This implements a suggestion by Craig in PR #123878. We can move the worklist management out of the per-instruction work and do it once at the end of scanning all the instructions. This should reduce repeat visitation of the same instruction when no changes can be made. Note that this does not remove the inherent O(N^2) in the algorithm. We're still potentially visiiting every user of every def. 
I also included a guard for unreachable blocks since that had been mentioned as a possible cause. It seems we've rulled that out, but guarding for this case is still a good idea. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 123 ++++++++++++--------- 1 file changed, 68 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 54ca8ccd8d9e90..66d26bf5b11e2d 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1292,53 +1292,60 @@ std::optional RISCVVLOptimizer::checkUsers(MachineInstr &MI) { return CommonVL; } -bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { - SetVector Worklist; - Worklist.insert(&OrigMI); +bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) { + LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); - bool MadeChange = false; - while (!Worklist.empty()) { - MachineInstr &MI = *Worklist.pop_back_val(); - LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); + if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI)) + return false; - if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI)) - continue; + auto CommonVL = checkUsers(MI); + if (!CommonVL) + return false; - auto CommonVL = checkUsers(MI); - if (!CommonVL) - continue; + assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && + "Expected VL to be an Imm or virtual Reg"); - assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && - "Expected VL to be an Imm or virtual Reg"); + unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); + MachineOperand &VLOp = MI.getOperand(VLOpNum); - unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); - MachineOperand &VLOp = MI.getOperand(VLOpNum); + if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { + LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); + return false; + } - if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { - LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= 
VLOp.\n"); - continue; - } + if (CommonVL->isImm()) { + LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " + << CommonVL->getImm() << " for " << MI << "\n"); + VLOp.ChangeToImmediate(CommonVL->getImm()); + return true; + } + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + if (!MDT->dominates(VLMI, &MI)) + return false; + LLVM_DEBUG( + dbgs() << " Reduce VL from " << VLOp << " to " + << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) + << " for " << MI << "\n"); - if (CommonVL->isImm()) { - LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " - << CommonVL->getImm() << " for " << MI << "\n"); - VLOp.ChangeToImmediate(CommonVL->getImm()); - } else { - const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); - if (!MDT->dominates(VLMI, &MI)) - continue; - LLVM_DEBUG( - dbgs() << " Reduce VL from " << VLOp << " to " - << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) - << " for " << MI << "\n"); + // All our checks passed. We can reduce VL. + VLOp.ChangeToRegister(CommonVL->getReg(), false); + return true; +} - // All our checks passed. We can reduce VL. - VLOp.ChangeToRegister(CommonVL->getReg(), false); - } +bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + MDT = &getAnalysis().getDomTree(); - MadeChange = true; + const RISCVSubtarget &ST = MF.getSubtarget(); + if (!ST.hasVInstructions()) + return false; - // Now add all inputs to this instruction to the worklist. 
+ SetVector Worklist; + auto PushOperands = [this, &Worklist](MachineInstr &MI, + bool IgnoreSameBlock) { for (auto &Op : MI.operands()) { if (!Op.isReg() || !Op.isUse() || !Op.getReg().isVirtual()) continue; @@ -1351,34 +1358,40 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { if (!isCandidate(*DefMI)) continue; + if (IgnoreSameBlock && DefMI->getParent() == MI.getParent()) + continue; + Worklist.insert(DefMI); } - } - - return MadeChange; -} - -bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - MRI = &MF.getRegInfo(); - MDT = &getAnalysis().getDomTree(); - - const RISCVSubtarget &ST = MF.getSubtarget(); - if (!ST.hasVInstructions()) - return false; + }; + // Do a first pass eagerly rewriting in roughly reverse instruction + // order, populate the worklist with any instructions we might need to + // revisit. We avoid adding definitions to the worklist if they're + // in the same block - we're about to visit them anyways. bool MadeChange = false; for (MachineBasicBlock &MBB : MF) { - // Visit instructions in reverse order. 
+ // Avoid unreachable blocks as they have degenerate dominance + if (!MDT->isReachableFromEntry(&MBB)) + continue; + for (auto &MI : make_range(MBB.rbegin(), MBB.rend())) { if (!isCandidate(MI)) continue; - - MadeChange |= tryReduceVL(MI); + if (!tryReduceVL(MI)) + continue; + MadeChange = true; + PushOperands(MI, /*IgnoreSameBlock*/ true); } } + while (!Worklist.empty()) { + assert(MadeChange); + MachineInstr &MI = *Worklist.pop_back_val(); + if (!tryReduceVL(MI)) + continue; + PushOperands(MI, /*IgnoreSameBlock*/ false); + } + return MadeChange; } From 340706f311e088f51002593b8cc7291500ca024e Mon Sep 17 00:00:00 2001 From: Stefan Pintilie Date: Wed, 22 Jan 2025 13:44:20 -0500 Subject: [PATCH 029/208] [PowerPC] Fix saving of Link Register when using ROP Protect (#123101) An optimization was added that tries to move the uses of the mflr instruction away from the instruction itself. However, this doesn't work when we are using the hashst instruction because that instruction needs to be run before the stack frame is obtained. This patch disables moving instructions away from the mflr in the case where ROP protection is being used. 
--------- Co-authored-by: Lei Huang --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 9 +- .../PowerPC/ppc64-rop-protection-aix.ll | 312 +++++++-------- .../CodeGen/PowerPC/ppc64-rop-protection.ll | 372 +++++++++--------- 3 files changed, 348 insertions(+), 345 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 39ebd7f8d0df20..ba775c4a679d08 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -646,7 +646,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool HasFP = hasFP(MF); bool HasBP = RegInfo->hasBasePointer(MF); bool HasRedZone = isPPC64 || !isSVR4ABI; - bool HasROPProtect = Subtarget.hasROPProtect(); + const bool HasROPProtect = Subtarget.hasROPProtect(); bool HasPrivileged = Subtarget.hasPrivileged(); Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; @@ -908,8 +908,10 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // in ScratchReg. // If the offset can not be encoded into the store instruction, we also have // to save LR here. + // If we are using ROP Protection we need to save the LR here as we cannot + // move the hashst instruction past the point where we get the stack frame. if (MustSaveLR && !HasFastMFLR && - (HasSTUX || !isInt<16>(FrameSize + LROffset))) + (HasSTUX || !isInt<16>(FrameSize + LROffset) || HasROPProtect)) SaveLR(LROffset); // If FrameSize <= TLI.getStackProbeSize(MF), as POWER ABI requires backchain @@ -1100,7 +1102,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } // Save the LR now. - if (!HasSTUX && MustSaveLR && !HasFastMFLR && isInt<16>(FrameSize + LROffset)) + if (!HasSTUX && MustSaveLR && !HasFastMFLR && + isInt<16>(FrameSize + LROffset) && !HasROPProtect) SaveLR(LROffset + FrameSize); // Add Call Frame Information for the instructions we generated above. 
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection-aix.ll b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection-aix.ll index 8955835f41ea6c..8df47808169bed 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection-aix.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection-aix.ll @@ -66,9 +66,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P9-LABEL: caller: ; BE-P9: # %bb.0: # %entry ; BE-P9-NEXT: mflr r0 -; BE-P9-NEXT: stdu r1, -128(r1) -; BE-P9-NEXT: std r0, 144(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst r0, -16(r1) +; BE-P9-NEXT: stdu r1, -128(r1) ; BE-P9-NEXT: std r31, 120(r1) # 8-byte Folded Spill ; BE-P9-NEXT: mr r31, r4 ; BE-P9-NEXT: bl .callee[PR] @@ -85,9 +85,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P8-LABEL: caller: ; BE-P8: # %bb.0: # %entry ; BE-P8-NEXT: mflr r0 -; BE-P8-NEXT: stdu r1, -128(r1) -; BE-P8-NEXT: std r0, 144(r1) +; BE-P8-NEXT: std r0, 16(r1) ; BE-P8-NEXT: hashst r0, -16(r1) +; BE-P8-NEXT: stdu r1, -128(r1) ; BE-P8-NEXT: std r31, 120(r1) # 8-byte Folded Spill ; BE-P8-NEXT: mr r31, r4 ; BE-P8-NEXT: bl .callee[PR] @@ -122,9 +122,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P9-LABEL: caller: ; BE-32BIT-P9: # %bb.0: # %entry ; BE-32BIT-P9-NEXT: mflr r0 -; BE-32BIT-P9-NEXT: stwu r1, -80(r1) -; BE-32BIT-P9-NEXT: stw r0, 88(r1) +; BE-32BIT-P9-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-NEXT: hashst r0, -16(r1) +; BE-32BIT-P9-NEXT: stwu r1, -80(r1) ; BE-32BIT-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: mr r31, r4 ; BE-32BIT-P9-NEXT: bl .callee[PR] @@ -140,9 +140,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P8-LABEL: caller: ; BE-32BIT-P8: # %bb.0: # %entry ; BE-32BIT-P8-NEXT: mflr r0 -; BE-32BIT-P8-NEXT: stwu r1, -80(r1) -; BE-32BIT-P8-NEXT: stw r0, 88(r1) +; BE-32BIT-P8-NEXT: stw r0, 8(r1) ; BE-32BIT-P8-NEXT: hashst r0, 
-16(r1) +; BE-32BIT-P8-NEXT: stwu r1, -80(r1) ; BE-32BIT-P8-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: mr r31, r4 ; BE-32BIT-P8-NEXT: bl .callee[PR] @@ -177,9 +177,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P9-PRIV-LABEL: caller: ; BE-P9-PRIV: # %bb.0: # %entry ; BE-P9-PRIV-NEXT: mflr r0 -; BE-P9-PRIV-NEXT: stdu r1, -128(r1) -; BE-P9-PRIV-NEXT: std r0, 144(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -16(r1) +; BE-P9-PRIV-NEXT: stdu r1, -128(r1) ; BE-P9-PRIV-NEXT: std r31, 120(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: mr r31, r4 ; BE-P9-PRIV-NEXT: bl .callee[PR] @@ -196,9 +196,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P8-PRIV-LABEL: caller: ; BE-P8-PRIV: # %bb.0: # %entry ; BE-P8-PRIV-NEXT: mflr r0 -; BE-P8-PRIV-NEXT: stdu r1, -128(r1) -; BE-P8-PRIV-NEXT: std r0, 144(r1) +; BE-P8-PRIV-NEXT: std r0, 16(r1) ; BE-P8-PRIV-NEXT: hashstp r0, -16(r1) +; BE-P8-PRIV-NEXT: stdu r1, -128(r1) ; BE-P8-PRIV-NEXT: std r31, 120(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: mr r31, r4 ; BE-P8-PRIV-NEXT: bl .callee[PR] @@ -233,9 +233,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P9-PRIV-LABEL: caller: ; BE-32BIT-P9-PRIV: # %bb.0: # %entry ; BE-32BIT-P9-PRIV-NEXT: mflr r0 -; BE-32BIT-P9-PRIV-NEXT: stwu r1, -80(r1) -; BE-32BIT-P9-PRIV-NEXT: stw r0, 88(r1) +; BE-32BIT-P9-PRIV-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-PRIV-NEXT: hashstp r0, -16(r1) +; BE-32BIT-P9-PRIV-NEXT: stwu r1, -80(r1) ; BE-32BIT-P9-PRIV-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: mr r31, r4 ; BE-32BIT-P9-PRIV-NEXT: bl .callee[PR] @@ -251,9 +251,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P8-PRIV-LABEL: caller: ; BE-32BIT-P8-PRIV: # %bb.0: # %entry ; BE-32BIT-P8-PRIV-NEXT: mflr r0 -; BE-32BIT-P8-PRIV-NEXT: stwu r1, -80(r1) -; BE-32BIT-P8-PRIV-NEXT: stw 
r0, 88(r1) +; BE-32BIT-P8-PRIV-NEXT: stw r0, 8(r1) ; BE-32BIT-P8-PRIV-NEXT: hashstp r0, -16(r1) +; BE-32BIT-P8-PRIV-NEXT: stwu r1, -80(r1) ; BE-32BIT-P8-PRIV-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: mr r31, r4 ; BE-32BIT-P8-PRIV-NEXT: bl .callee[PR] @@ -406,39 +406,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P9-LABEL: spill: ; BE-P9: # %bb.0: # %entry -; BE-P9-NEXT: mfcr r12 ; BE-P9-NEXT: mflr r0 +; BE-P9-NEXT: mfcr r12 ; BE-P9-NEXT: stw r12, 8(r1) -; BE-P9-NEXT: stdu r1, -624(r1) -; BE-P9-NEXT: std r0, 640(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst r0, -488(r1) +; BE-P9-NEXT: stdu r1, -624(r1) +; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: std r14, 336(r1) # 8-byte Folded Spill +; BE-P9-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-NEXT: stxv v20, 144(r1) # 16-byte Folded Spill ; BE-P9-NEXT: stxv v21, 160(r1) # 16-byte Folded Spill -; BE-P9-NEXT: lwz r4, 12(r3) -; BE-P9-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-NEXT: stxv v22, 176(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r16, 352(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r17, 360(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r19, 376(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r20, 384(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r21, 392(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r22, 400(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill +; 
BE-P9-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r23, 408(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r25, 424(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r26, 432(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r27, 440(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P9-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P9-NEXT: std r30, 464(r1) # 8-byte Folded Spill @@ -533,62 +533,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P8-LABEL: spill: ; BE-P8: # %bb.0: # %entry +; BE-P8-NEXT: mflr r0 ; BE-P8-NEXT: mfcr r12 ; BE-P8-NEXT: stw r12, 8(r1) -; BE-P8-NEXT: mflr r0 +; BE-P8-NEXT: std r0, 16(r1) +; BE-P8-NEXT: hashst r0, -488(r1) ; BE-P8-NEXT: stdu r1, -624(r1) ; BE-P8-NEXT: li r4, 144 -; BE-P8-NEXT: std r0, 640(r1) -; BE-P8-NEXT: hashst r0, -488(r1) ; BE-P8-NEXT: std r14, 336(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r16, 352(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r17, 360(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 160 ; BE-P8-NEXT: std r19, 376(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r20, 384(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 160 ; BE-P8-NEXT: std r21, 392(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r22, 400(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r23, 408(r1) # 8-byte Folded Spill ; 
BE-P8-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 176 ; BE-P8-NEXT: std r25, 424(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r26, 432(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 176 ; BE-P8-NEXT: std r27, 440(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r30, 464(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 192 ; BE-P8-NEXT: std r31, 472(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f14, 480(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 192 ; BE-P8-NEXT: stfd f15, 488(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f16, 496(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f17, 504(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f18, 512(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 208 ; BE-P8-NEXT: stfd f19, 520(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f20, 528(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 208 ; BE-P8-NEXT: stfd f21, 536(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f22, 544(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f23, 552(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f24, 560(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 224 ; BE-P8-NEXT: stfd f25, 568(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f26, 576(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 224 ; BE-P8-NEXT: stfd f27, 584(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f28, 592(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f29, 600(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f30, 608(r1) # 8-byte 
Folded Spill -; BE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 240 ; BE-P8-NEXT: stfd f31, 616(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r3, 120(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 240 ; BE-P8-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-P8-NEXT: li r4, 256 ; BE-P8-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill @@ -812,39 +812,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-32BIT-P9-LABEL: spill: ; BE-32BIT-P9: # %bb.0: # %entry -; BE-32BIT-P9-NEXT: mfcr r12 ; BE-32BIT-P9-NEXT: mflr r0 +; BE-32BIT-P9-NEXT: mfcr r12 ; BE-32BIT-P9-NEXT: stw r12, 4(r1) -; BE-32BIT-P9-NEXT: stwu r1, -496(r1) -; BE-32BIT-P9-NEXT: stw r0, 504(r1) +; BE-32BIT-P9-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-NEXT: hashst r0, -424(r1) +; BE-32BIT-P9-NEXT: stwu r1, -496(r1) +; BE-32BIT-P9-NEXT: lwz r4, 12(r3) ; BE-32BIT-P9-NEXT: stw r13, 276(r1) # 4-byte Folded Spill +; BE-32BIT-P9-NEXT: stw r14, 280(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stxv v20, 80(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stxv v21, 96(r1) # 16-byte Folded Spill -; BE-32BIT-P9-NEXT: lwz r4, 12(r3) -; BE-32BIT-P9-NEXT: stw r14, 280(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stxv v22, 112(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r15, 284(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v23, 128(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r16, 288(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v24, 144(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v23, 128(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r17, 292(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v25, 160(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v24, 144(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r18, 296(r1) # 4-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v25, 160(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r19, 300(r1) # 4-byte Folded Spill -; 
BE-32BIT-P9-NEXT: stxv v26, 176(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r20, 304(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v27, 192(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v26, 176(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r21, 308(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v28, 208(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v27, 192(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r22, 312(r1) # 4-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v28, 208(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r23, 316(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v29, 224(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r24, 320(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v30, 240(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v29, 224(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r25, 324(r1) # 4-byte Folded Spill -; BE-32BIT-P9-NEXT: stxv v31, 256(r1) # 16-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v30, 240(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r26, 328(r1) # 4-byte Folded Spill +; BE-32BIT-P9-NEXT: stxv v31, 256(r1) # 16-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r27, 332(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r28, 336(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r29, 340(r1) # 4-byte Folded Spill @@ -940,62 +940,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-32BIT-P8-LABEL: spill: ; BE-32BIT-P8: # %bb.0: # %entry +; BE-32BIT-P8-NEXT: mflr r0 ; BE-32BIT-P8-NEXT: mfcr r12 ; BE-32BIT-P8-NEXT: stw r12, 4(r1) -; BE-32BIT-P8-NEXT: mflr r0 +; BE-32BIT-P8-NEXT: stw r0, 8(r1) +; BE-32BIT-P8-NEXT: hashst r0, -424(r1) ; BE-32BIT-P8-NEXT: stwu r1, -496(r1) ; BE-32BIT-P8-NEXT: li r4, 80 -; BE-32BIT-P8-NEXT: stw r0, 504(r1) -; BE-32BIT-P8-NEXT: hashst r0, -424(r1) ; BE-32BIT-P8-NEXT: stw r13, 276(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r14, 280(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r15, 284(r1) # 4-byte Folded 
Spill ; BE-32BIT-P8-NEXT: stw r16, 288(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r17, 292(r1) # 4-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 96 ; BE-32BIT-P8-NEXT: stw r18, 296(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r19, 300(r1) # 4-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 96 ; BE-32BIT-P8-NEXT: stw r20, 304(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r21, 308(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r22, 312(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r23, 316(r1) # 4-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 112 ; BE-32BIT-P8-NEXT: stw r24, 320(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r25, 324(r1) # 4-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 112 ; BE-32BIT-P8-NEXT: stw r26, 328(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r27, 332(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r28, 336(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r29, 340(r1) # 4-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 128 ; BE-32BIT-P8-NEXT: stw r30, 344(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r31, 348(r1) # 4-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 128 ; BE-32BIT-P8-NEXT: stfd f14, 352(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f15, 360(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f16, 368(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f17, 376(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 144 ; BE-32BIT-P8-NEXT: stfd f18, 384(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f19, 392(r1) # 8-byte Folded Spill +; 
BE-32BIT-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 144 ; BE-32BIT-P8-NEXT: stfd f20, 400(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f21, 408(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f22, 416(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f23, 424(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 160 ; BE-32BIT-P8-NEXT: stfd f24, 432(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f25, 440(r1) # 8-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 160 ; BE-32BIT-P8-NEXT: stfd f26, 448(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f27, 456(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f28, 464(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f29, 472(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 176 ; BE-32BIT-P8-NEXT: stfd f30, 480(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f31, 488(r1) # 8-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 176 ; BE-32BIT-P8-NEXT: stw r3, 64(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-32BIT-P8-NEXT: li r4, 192 @@ -1219,39 +1219,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P9-PRIV-LABEL: spill: ; BE-P9-PRIV: # %bb.0: # %entry -; BE-P9-PRIV-NEXT: mfcr r12 ; BE-P9-PRIV-NEXT: mflr r0 +; BE-P9-PRIV-NEXT: mfcr r12 ; BE-P9-PRIV-NEXT: stw r12, 8(r1) -; BE-P9-PRIV-NEXT: stdu r1, -624(r1) -; BE-P9-PRIV-NEXT: std r0, 640(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -488(r1) +; BE-P9-PRIV-NEXT: stdu r1, -624(r1) +; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: std r14, 336(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v20, 144(r1) # 
16-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v21, 160(r1) # 16-byte Folded Spill -; BE-P9-PRIV-NEXT: lwz r4, 12(r3) -; BE-P9-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v22, 176(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r16, 352(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r17, 360(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r19, 376(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r20, 384(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r21, 392(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r22, 400(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r23, 408(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r25, 424(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r26, 432(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r27, 440(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v31, 
320(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: std r30, 464(r1) # 8-byte Folded Spill @@ -1346,62 +1346,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P8-PRIV-LABEL: spill: ; BE-P8-PRIV: # %bb.0: # %entry +; BE-P8-PRIV-NEXT: mflr r0 ; BE-P8-PRIV-NEXT: mfcr r12 ; BE-P8-PRIV-NEXT: stw r12, 8(r1) -; BE-P8-PRIV-NEXT: mflr r0 +; BE-P8-PRIV-NEXT: std r0, 16(r1) +; BE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; BE-P8-PRIV-NEXT: stdu r1, -624(r1) ; BE-P8-PRIV-NEXT: li r4, 144 -; BE-P8-PRIV-NEXT: std r0, 640(r1) -; BE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; BE-P8-PRIV-NEXT: std r14, 336(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r16, 352(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r17, 360(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 160 ; BE-P8-PRIV-NEXT: std r19, 376(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r20, 384(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 160 ; BE-P8-PRIV-NEXT: std r21, 392(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r22, 400(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r23, 408(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 176 ; BE-P8-PRIV-NEXT: std r25, 424(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r26, 432(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 176 ; BE-P8-PRIV-NEXT: std r27, 440(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r29, 456(r1) # 8-byte Folded 
Spill ; BE-P8-PRIV-NEXT: std r30, 464(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 192 ; BE-P8-PRIV-NEXT: std r31, 472(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f14, 480(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 192 ; BE-P8-PRIV-NEXT: stfd f15, 488(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f16, 496(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f17, 504(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f18, 512(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 208 ; BE-P8-PRIV-NEXT: stfd f19, 520(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f20, 528(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 208 ; BE-P8-PRIV-NEXT: stfd f21, 536(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f22, 544(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f23, 552(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f24, 560(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 224 ; BE-P8-PRIV-NEXT: stfd f25, 568(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f26, 576(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 224 ; BE-P8-PRIV-NEXT: stfd f27, 584(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f28, 592(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f29, 600(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f30, 608(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 240 ; BE-P8-PRIV-NEXT: stfd f31, 616(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r3, 120(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 240 ; 
BE-P8-PRIV-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-P8-PRIV-NEXT: li r4, 256 ; BE-P8-PRIV-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill @@ -1625,39 +1625,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-32BIT-P9-PRIV-LABEL: spill: ; BE-32BIT-P9-PRIV: # %bb.0: # %entry -; BE-32BIT-P9-PRIV-NEXT: mfcr r12 ; BE-32BIT-P9-PRIV-NEXT: mflr r0 +; BE-32BIT-P9-PRIV-NEXT: mfcr r12 ; BE-32BIT-P9-PRIV-NEXT: stw r12, 4(r1) -; BE-32BIT-P9-PRIV-NEXT: stwu r1, -496(r1) -; BE-32BIT-P9-PRIV-NEXT: stw r0, 504(r1) +; BE-32BIT-P9-PRIV-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-PRIV-NEXT: hashstp r0, -424(r1) +; BE-32BIT-P9-PRIV-NEXT: stwu r1, -496(r1) +; BE-32BIT-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-32BIT-P9-PRIV-NEXT: stw r13, 276(r1) # 4-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stw r14, 280(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stxv v20, 80(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stxv v21, 96(r1) # 16-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: lwz r4, 12(r3) -; BE-32BIT-P9-PRIV-NEXT: stw r14, 280(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stxv v22, 112(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r15, 284(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v23, 128(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r16, 288(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v24, 144(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v23, 128(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r17, 292(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v25, 160(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v24, 144(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r18, 296(r1) # 4-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v25, 160(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r19, 300(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v26, 176(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r20, 304(r1) # 4-byte Folded 
Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v27, 192(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v26, 176(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r21, 308(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v28, 208(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v27, 192(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r22, 312(r1) # 4-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v28, 208(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r23, 316(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v29, 224(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r24, 320(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v30, 240(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v29, 224(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r25, 324(r1) # 4-byte Folded Spill -; BE-32BIT-P9-PRIV-NEXT: stxv v31, 256(r1) # 16-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v30, 240(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r26, 328(r1) # 4-byte Folded Spill +; BE-32BIT-P9-PRIV-NEXT: stxv v31, 256(r1) # 16-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r27, 332(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r28, 336(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: stw r29, 340(r1) # 4-byte Folded Spill @@ -1753,62 +1753,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-32BIT-P8-PRIV-LABEL: spill: ; BE-32BIT-P8-PRIV: # %bb.0: # %entry +; BE-32BIT-P8-PRIV-NEXT: mflr r0 ; BE-32BIT-P8-PRIV-NEXT: mfcr r12 ; BE-32BIT-P8-PRIV-NEXT: stw r12, 4(r1) -; BE-32BIT-P8-PRIV-NEXT: mflr r0 +; BE-32BIT-P8-PRIV-NEXT: stw r0, 8(r1) +; BE-32BIT-P8-PRIV-NEXT: hashstp r0, -424(r1) ; BE-32BIT-P8-PRIV-NEXT: stwu r1, -496(r1) ; BE-32BIT-P8-PRIV-NEXT: li r4, 80 -; BE-32BIT-P8-PRIV-NEXT: stw r0, 504(r1) -; BE-32BIT-P8-PRIV-NEXT: hashstp r0, -424(r1) ; BE-32BIT-P8-PRIV-NEXT: stw r13, 276(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r14, 280(r1) # 4-byte 
Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r15, 284(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r16, 288(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r17, 292(r1) # 4-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 96 ; BE-32BIT-P8-PRIV-NEXT: stw r18, 296(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r19, 300(r1) # 4-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 96 ; BE-32BIT-P8-PRIV-NEXT: stw r20, 304(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r21, 308(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r22, 312(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r23, 316(r1) # 4-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 112 ; BE-32BIT-P8-PRIV-NEXT: stw r24, 320(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r25, 324(r1) # 4-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 112 ; BE-32BIT-P8-PRIV-NEXT: stw r26, 328(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r27, 332(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r28, 336(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r29, 340(r1) # 4-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 128 ; BE-32BIT-P8-PRIV-NEXT: stw r30, 344(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stw r31, 348(r1) # 4-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 128 ; BE-32BIT-P8-PRIV-NEXT: stfd f14, 352(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f15, 360(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f16, 368(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f17, 376(r1) # 8-byte Folded Spill -; 
BE-32BIT-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 144 ; BE-32BIT-P8-PRIV-NEXT: stfd f18, 384(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f19, 392(r1) # 8-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 144 ; BE-32BIT-P8-PRIV-NEXT: stfd f20, 400(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f21, 408(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f22, 416(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f23, 424(r1) # 8-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 160 ; BE-32BIT-P8-PRIV-NEXT: stfd f24, 432(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f25, 440(r1) # 8-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 160 ; BE-32BIT-P8-PRIV-NEXT: stfd f26, 448(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f27, 456(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f28, 464(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f29, 472(r1) # 8-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-PRIV-NEXT: li r4, 176 ; BE-32BIT-P8-PRIV-NEXT: stfd f30, 480(r1) # 8-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stfd f31, 488(r1) # 8-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-PRIV-NEXT: li r4, 176 ; BE-32BIT-P8-PRIV-NEXT: stw r3, 64(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: li r4, 192 @@ -1954,12 +1954,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P9-NEXT: beq cr0, L..BB2_2 ; BE-P9-NEXT: # %bb.1: # %if.end ; BE-P9-NEXT: mflr r0 -; BE-P9-NEXT: stdu r1, -144(r1) -; BE-P9-NEXT: std r0, 160(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst 
r0, -16(r1) +; BE-P9-NEXT: stdu r1, -144(r1) +; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill ; BE-P9-NEXT: mr r31, r3 -; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: stw r4, 124(r1) ; BE-P9-NEXT: addi r4, r1, 124 ; BE-P9-NEXT: mr r3, r4 @@ -1984,12 +1984,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P8-NEXT: beq cr0, L..BB2_2 ; BE-P8-NEXT: # %bb.1: # %if.end ; BE-P8-NEXT: mflr r0 -; BE-P8-NEXT: stdu r1, -144(r1) -; BE-P8-NEXT: std r0, 160(r1) +; BE-P8-NEXT: std r0, 16(r1) ; BE-P8-NEXT: hashst r0, -16(r1) +; BE-P8-NEXT: stdu r1, -144(r1) +; BE-P8-NEXT: lwz r4, 12(r3) ; BE-P8-NEXT: std r31, 136(r1) # 8-byte Folded Spill ; BE-P8-NEXT: mr r31, r3 -; BE-P8-NEXT: lwz r4, 12(r3) ; BE-P8-NEXT: stw r4, 124(r1) ; BE-P8-NEXT: addi r4, r1, 124 ; BE-P8-NEXT: mr r3, r4 @@ -2043,12 +2043,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P9-NEXT: beq cr0, L..BB2_2 ; BE-32BIT-P9-NEXT: # %bb.1: # %if.end ; BE-32BIT-P9-NEXT: mflr r0 -; BE-32BIT-P9-NEXT: stwu r1, -80(r1) -; BE-32BIT-P9-NEXT: stw r0, 88(r1) +; BE-32BIT-P9-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-NEXT: hashst r0, -16(r1) +; BE-32BIT-P9-NEXT: stwu r1, -80(r1) +; BE-32BIT-P9-NEXT: lwz r4, 12(r3) ; BE-32BIT-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: mr r31, r3 -; BE-32BIT-P9-NEXT: lwz r4, 12(r3) ; BE-32BIT-P9-NEXT: stw r4, 60(r1) ; BE-32BIT-P9-NEXT: addi r4, r1, 60 ; BE-32BIT-P9-NEXT: mr r3, r4 @@ -2072,12 +2072,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P8-NEXT: beq cr0, L..BB2_2 ; BE-32BIT-P8-NEXT: # %bb.1: # %if.end ; BE-32BIT-P8-NEXT: mflr r0 -; BE-32BIT-P8-NEXT: stwu r1, -80(r1) -; BE-32BIT-P8-NEXT: stw r0, 88(r1) +; BE-32BIT-P8-NEXT: stw r0, 8(r1) ; BE-32BIT-P8-NEXT: hashst r0, -16(r1) +; BE-32BIT-P8-NEXT: stwu r1, -80(r1) +; BE-32BIT-P8-NEXT: lwz r4, 12(r3) ; BE-32BIT-P8-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: mr r31, r3 -; BE-32BIT-P8-NEXT: lwz r4, 
12(r3) ; BE-32BIT-P8-NEXT: stw r4, 60(r1) ; BE-32BIT-P8-NEXT: addi r4, r1, 60 ; BE-32BIT-P8-NEXT: mr r3, r4 @@ -2131,12 +2131,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P9-PRIV-NEXT: beq cr0, L..BB2_2 ; BE-P9-PRIV-NEXT: # %bb.1: # %if.end ; BE-P9-PRIV-NEXT: mflr r0 -; BE-P9-PRIV-NEXT: stdu r1, -144(r1) -; BE-P9-PRIV-NEXT: std r0, 160(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -16(r1) +; BE-P9-PRIV-NEXT: stdu r1, -144(r1) +; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: std r31, 136(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: mr r31, r3 -; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: stw r4, 124(r1) ; BE-P9-PRIV-NEXT: addi r4, r1, 124 ; BE-P9-PRIV-NEXT: mr r3, r4 @@ -2161,12 +2161,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P8-PRIV-NEXT: beq cr0, L..BB2_2 ; BE-P8-PRIV-NEXT: # %bb.1: # %if.end ; BE-P8-PRIV-NEXT: mflr r0 -; BE-P8-PRIV-NEXT: stdu r1, -144(r1) -; BE-P8-PRIV-NEXT: std r0, 160(r1) +; BE-P8-PRIV-NEXT: std r0, 16(r1) ; BE-P8-PRIV-NEXT: hashstp r0, -16(r1) +; BE-P8-PRIV-NEXT: stdu r1, -144(r1) +; BE-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-P8-PRIV-NEXT: std r31, 136(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: mr r31, r3 -; BE-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-P8-PRIV-NEXT: stw r4, 124(r1) ; BE-P8-PRIV-NEXT: addi r4, r1, 124 ; BE-P8-PRIV-NEXT: mr r3, r4 @@ -2220,12 +2220,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P9-PRIV-NEXT: beq cr0, L..BB2_2 ; BE-32BIT-P9-PRIV-NEXT: # %bb.1: # %if.end ; BE-32BIT-P9-PRIV-NEXT: mflr r0 -; BE-32BIT-P9-PRIV-NEXT: stwu r1, -80(r1) -; BE-32BIT-P9-PRIV-NEXT: stw r0, 88(r1) +; BE-32BIT-P9-PRIV-NEXT: stw r0, 8(r1) ; BE-32BIT-P9-PRIV-NEXT: hashstp r0, -16(r1) +; BE-32BIT-P9-PRIV-NEXT: stwu r1, -80(r1) +; BE-32BIT-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-32BIT-P9-PRIV-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P9-PRIV-NEXT: mr r31, r3 -; BE-32BIT-P9-PRIV-NEXT: lwz r4, 12(r3) ; 
BE-32BIT-P9-PRIV-NEXT: stw r4, 60(r1) ; BE-32BIT-P9-PRIV-NEXT: addi r4, r1, 60 ; BE-32BIT-P9-PRIV-NEXT: mr r3, r4 @@ -2249,12 +2249,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P8-PRIV-NEXT: beq cr0, L..BB2_2 ; BE-32BIT-P8-PRIV-NEXT: # %bb.1: # %if.end ; BE-32BIT-P8-PRIV-NEXT: mflr r0 -; BE-32BIT-P8-PRIV-NEXT: stwu r1, -80(r1) -; BE-32BIT-P8-PRIV-NEXT: stw r0, 88(r1) +; BE-32BIT-P8-PRIV-NEXT: stw r0, 8(r1) ; BE-32BIT-P8-PRIV-NEXT: hashstp r0, -16(r1) +; BE-32BIT-P8-PRIV-NEXT: stwu r1, -80(r1) +; BE-32BIT-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-32BIT-P8-PRIV-NEXT: stw r31, 76(r1) # 4-byte Folded Spill ; BE-32BIT-P8-PRIV-NEXT: mr r31, r3 -; BE-32BIT-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-32BIT-P8-PRIV-NEXT: stw r4, 60(r1) ; BE-32BIT-P8-PRIV-NEXT: addi r4, r1, 60 ; BE-32BIT-P8-PRIV-NEXT: mr r3, r4 diff --git a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll index 1ad1483bd81a83..829bf0f0d052e1 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll @@ -84,9 +84,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P9: # %bb.0: # %entry ; LE-P9-NEXT: mflr r0 ; LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stdu r1, -64(r1) -; LE-P9-NEXT: std r0, 80(r1) +; LE-P9-NEXT: std r0, 16(r1) ; LE-P9-NEXT: hashst r0, -24(r1) +; LE-P9-NEXT: stdu r1, -64(r1) ; LE-P9-NEXT: mr r30, r4 ; LE-P9-NEXT: bl callee ; LE-P9-NEXT: nop @@ -103,9 +103,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P8: # %bb.0: # %entry ; LE-P8-NEXT: mflr r0 ; LE-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stdu r1, -64(r1) -; LE-P8-NEXT: std r0, 80(r1) +; LE-P8-NEXT: std r0, 16(r1) ; LE-P8-NEXT: hashst r0, -24(r1) +; LE-P8-NEXT: stdu r1, -64(r1) ; LE-P8-NEXT: mr r30, r4 ; LE-P8-NEXT: bl callee ; LE-P8-NEXT: nop @@ -144,9 +144,9 @@ define dso_local zeroext 
i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P9-O0-LABEL: caller: ; LE-P9-O0: # %bb.0: # %entry ; LE-P9-O0-NEXT: mflr r0 -; LE-P9-O0-NEXT: stdu r1, -112(r1) -; LE-P9-O0-NEXT: std r0, 128(r1) +; LE-P9-O0-NEXT: std r0, 16(r1) ; LE-P9-O0-NEXT: hashst r0, -8(r1) +; LE-P9-O0-NEXT: stdu r1, -112(r1) ; LE-P9-O0-NEXT: # kill: def $r4 killed $r4 killed $x4 ; LE-P9-O0-NEXT: stw r4, 100(r1) # 4-byte Folded Spill ; LE-P9-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 @@ -165,9 +165,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P8-O0-LABEL: caller: ; LE-P8-O0: # %bb.0: # %entry ; LE-P8-O0-NEXT: mflr r0 -; LE-P8-O0-NEXT: stdu r1, -112(r1) -; LE-P8-O0-NEXT: std r0, 128(r1) +; LE-P8-O0-NEXT: std r0, 16(r1) ; LE-P8-O0-NEXT: hashst r0, -8(r1) +; LE-P8-O0-NEXT: stdu r1, -112(r1) ; LE-P8-O0-NEXT: # kill: def $r4 killed $r4 killed $x4 ; LE-P8-O0-NEXT: stw r4, 100(r1) # 4-byte Folded Spill ; LE-P8-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 @@ -205,9 +205,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P9-LABEL: caller: ; BE-P9: # %bb.0: # %entry ; BE-P9-NEXT: mflr r0 -; BE-P9-NEXT: stdu r1, -144(r1) -; BE-P9-NEXT: std r0, 160(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst r0, -24(r1) +; BE-P9-NEXT: stdu r1, -144(r1) ; BE-P9-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P9-NEXT: mr r30, r4 ; BE-P9-NEXT: bl callee @@ -224,9 +224,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P8-LABEL: caller: ; BE-P8: # %bb.0: # %entry ; BE-P8-NEXT: mflr r0 -; BE-P8-NEXT: stdu r1, -144(r1) -; BE-P8-NEXT: std r0, 160(r1) +; BE-P8-NEXT: std r0, 16(r1) ; BE-P8-NEXT: hashst r0, -24(r1) +; BE-P8-NEXT: stdu r1, -144(r1) ; BE-P8-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P8-NEXT: mr r30, r4 ; BE-P8-NEXT: bl callee @@ -260,9 +260,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P9-LABEL: caller: ; 
BE-32BIT-P9: # %bb.0: # %entry ; BE-32BIT-P9-NEXT: mflr r0 -; BE-32BIT-P9-NEXT: stwu r1, -32(r1) -; BE-32BIT-P9-NEXT: stw r0, 36(r1) +; BE-32BIT-P9-NEXT: stw r0, 4(r1) ; BE-32BIT-P9-NEXT: hashst r0, -16(r1) +; BE-32BIT-P9-NEXT: stwu r1, -32(r1) ; BE-32BIT-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: mr r30, r4 ; BE-32BIT-P9-NEXT: bl callee @@ -277,9 +277,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-32BIT-P8-LABEL: caller: ; BE-32BIT-P8: # %bb.0: # %entry ; BE-32BIT-P8-NEXT: mflr r0 -; BE-32BIT-P8-NEXT: stwu r1, -32(r1) -; BE-32BIT-P8-NEXT: stw r0, 36(r1) +; BE-32BIT-P8-NEXT: stw r0, 4(r1) ; BE-32BIT-P8-NEXT: hashst r0, -16(r1) +; BE-32BIT-P8-NEXT: stwu r1, -32(r1) ; BE-32BIT-P8-NEXT: stw r30, 24(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: mr r30, r4 ; BE-32BIT-P8-NEXT: bl callee @@ -313,9 +313,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P9-PRIV: # %bb.0: # %entry ; LE-P9-PRIV-NEXT: mflr r0 ; LE-P9-PRIV-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stdu r1, -64(r1) -; LE-P9-PRIV-NEXT: std r0, 80(r1) +; LE-P9-PRIV-NEXT: std r0, 16(r1) ; LE-P9-PRIV-NEXT: hashstp r0, -24(r1) +; LE-P9-PRIV-NEXT: stdu r1, -64(r1) ; LE-P9-PRIV-NEXT: mr r30, r4 ; LE-P9-PRIV-NEXT: bl callee ; LE-P9-PRIV-NEXT: nop @@ -332,9 +332,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; LE-P8-PRIV: # %bb.0: # %entry ; LE-P8-PRIV-NEXT: mflr r0 ; LE-P8-PRIV-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stdu r1, -64(r1) -; LE-P8-PRIV-NEXT: std r0, 80(r1) +; LE-P8-PRIV-NEXT: std r0, 16(r1) ; LE-P8-PRIV-NEXT: hashstp r0, -24(r1) +; LE-P8-PRIV-NEXT: stdu r1, -64(r1) ; LE-P8-PRIV-NEXT: mr r30, r4 ; LE-P8-PRIV-NEXT: bl callee ; LE-P8-PRIV-NEXT: nop @@ -369,9 +369,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P9-PRIV-LABEL: caller: ; BE-P9-PRIV: # %bb.0: # %entry ; 
BE-P9-PRIV-NEXT: mflr r0 -; BE-P9-PRIV-NEXT: stdu r1, -144(r1) -; BE-P9-PRIV-NEXT: std r0, 160(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -24(r1) +; BE-P9-PRIV-NEXT: stdu r1, -144(r1) ; BE-P9-PRIV-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: mr r30, r4 ; BE-P9-PRIV-NEXT: bl callee @@ -388,9 +388,9 @@ define dso_local zeroext i32 @caller(i32 zeroext %in, i32 zeroext %add_after) #0 ; BE-P8-PRIV-LABEL: caller: ; BE-P8-PRIV: # %bb.0: # %entry ; BE-P8-PRIV-NEXT: mflr r0 -; BE-P8-PRIV-NEXT: stdu r1, -144(r1) -; BE-P8-PRIV-NEXT: std r0, 160(r1) +; BE-P8-PRIV-NEXT: std r0, 16(r1) ; BE-P8-PRIV-NEXT: hashstp r0, -24(r1) +; BE-P8-PRIV-NEXT: stdu r1, -144(r1) ; BE-P8-PRIV-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: mr r30, r4 ; BE-P8-PRIV-NEXT: bl callee @@ -542,39 +542,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; LE-P9-LABEL: spill: ; LE-P9: # %bb.0: # %entry -; LE-P9-NEXT: mfcr r12 ; LE-P9-NEXT: mflr r0 +; LE-P9-NEXT: mfcr r12 ; LE-P9-NEXT: stw r12, 8(r1) -; LE-P9-NEXT: stdu r1, -544(r1) -; LE-P9-NEXT: std r0, 560(r1) +; LE-P9-NEXT: std r0, 16(r1) ; LE-P9-NEXT: hashst r0, -488(r1) +; LE-P9-NEXT: stdu r1, -544(r1) +; LE-P9-NEXT: lwz r4, 12(r3) ; LE-P9-NEXT: std r14, 256(r1) # 8-byte Folded Spill +; LE-P9-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P9-NEXT: stxv v20, 64(r1) # 16-byte Folded Spill ; LE-P9-NEXT: stxv v21, 80(r1) # 16-byte Folded Spill -; LE-P9-NEXT: lwz r4, 12(r3) -; LE-P9-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P9-NEXT: stxv v22, 96(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r16, 272(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v23, 112(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r17, 280(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v24, 128(r1) # 16-byte Folded Spill +; LE-P9-NEXT: stxv v23, 112(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r18, 288(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v25, 144(r1) # 16-byte Folded Spill +; 
LE-P9-NEXT: stxv v24, 128(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r19, 296(r1) # 8-byte Folded Spill +; LE-P9-NEXT: stxv v25, 144(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r20, 304(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v26, 160(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r21, 312(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v27, 176(r1) # 16-byte Folded Spill +; LE-P9-NEXT: stxv v26, 160(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r22, 320(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v28, 192(r1) # 16-byte Folded Spill +; LE-P9-NEXT: stxv v27, 176(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r23, 328(r1) # 8-byte Folded Spill +; LE-P9-NEXT: stxv v28, 192(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r24, 336(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v29, 208(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r25, 344(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v30, 224(r1) # 16-byte Folded Spill +; LE-P9-NEXT: stxv v29, 208(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r26, 352(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stxv v31, 240(r1) # 16-byte Folded Spill +; LE-P9-NEXT: stxv v30, 224(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r27, 360(r1) # 8-byte Folded Spill +; LE-P9-NEXT: stxv v31, 240(r1) # 16-byte Folded Spill ; LE-P9-NEXT: std r28, 368(r1) # 8-byte Folded Spill ; LE-P9-NEXT: std r29, 376(r1) # 8-byte Folded Spill ; LE-P9-NEXT: std r30, 384(r1) # 8-byte Folded Spill @@ -669,62 +669,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; LE-P8-LABEL: spill: ; LE-P8: # %bb.0: # %entry +; LE-P8-NEXT: mflr r0 ; LE-P8-NEXT: mfcr r12 ; LE-P8-NEXT: stw r12, 8(r1) -; LE-P8-NEXT: mflr r0 +; LE-P8-NEXT: std r0, 16(r1) +; LE-P8-NEXT: hashst r0, -488(r1) ; LE-P8-NEXT: stdu r1, -544(r1) ; LE-P8-NEXT: li r4, 64 -; LE-P8-NEXT: std r0, 560(r1) -; LE-P8-NEXT: hashst r0, -488(r1) ; LE-P8-NEXT: std r14, 256(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r16, 272(r1) # 8-byte Folded 
Spill ; LE-P8-NEXT: std r17, 280(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r18, 288(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 80 ; LE-P8-NEXT: std r19, 296(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r20, 304(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 80 ; LE-P8-NEXT: std r21, 312(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r22, 320(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r23, 328(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r24, 336(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 96 ; LE-P8-NEXT: std r25, 344(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r26, 352(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 96 ; LE-P8-NEXT: std r27, 360(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r28, 368(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r29, 376(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r30, 384(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 112 ; LE-P8-NEXT: std r31, 392(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f14, 400(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 112 ; LE-P8-NEXT: stfd f15, 408(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f16, 416(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f17, 424(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f18, 432(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 128 ; LE-P8-NEXT: stfd f19, 440(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f20, 448(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 128 ; LE-P8-NEXT: stfd f21, 456(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f22, 464(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f23, 472(r1) # 8-byte 
Folded Spill ; LE-P8-NEXT: stfd f24, 480(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 144 ; LE-P8-NEXT: stfd f25, 488(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f26, 496(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 144 ; LE-P8-NEXT: stfd f27, 504(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f28, 512(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f29, 520(r1) # 8-byte Folded Spill ; LE-P8-NEXT: stfd f30, 528(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; LE-P8-NEXT: li r4, 160 ; LE-P8-NEXT: stfd f31, 536(r1) # 8-byte Folded Spill ; LE-P8-NEXT: std r3, 40(r1) # 8-byte Folded Spill +; LE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; LE-P8-NEXT: li r4, 160 ; LE-P8-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; LE-P8-NEXT: li r4, 176 ; LE-P8-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill @@ -951,9 +951,9 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; LE-P9-O0-NEXT: mflr r0 ; LE-P9-O0-NEXT: mfcr r12 ; LE-P9-O0-NEXT: stw r12, 8(r1) -; LE-P9-O0-NEXT: stdu r1, -608(r1) -; LE-P9-O0-NEXT: std r0, 624(r1) +; LE-P9-O0-NEXT: std r0, 16(r1) ; LE-P9-O0-NEXT: hashst r0, -488(r1) +; LE-P9-O0-NEXT: stdu r1, -608(r1) ; LE-P9-O0-NEXT: std r14, 320(r1) # 8-byte Folded Spill ; LE-P9-O0-NEXT: std r15, 328(r1) # 8-byte Folded Spill ; LE-P9-O0-NEXT: std r16, 336(r1) # 8-byte Folded Spill @@ -1079,9 +1079,9 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; LE-P8-O0-NEXT: mflr r0 ; LE-P8-O0-NEXT: mfcr r12 ; LE-P8-O0-NEXT: stw r12, 8(r1) -; LE-P8-O0-NEXT: stdu r1, -608(r1) -; LE-P8-O0-NEXT: std r0, 624(r1) +; LE-P8-O0-NEXT: std r0, 16(r1) ; LE-P8-O0-NEXT: hashst r0, -488(r1) +; LE-P8-O0-NEXT: stdu r1, -608(r1) ; LE-P8-O0-NEXT: std r14, 320(r1) # 8-byte Folded Spill ; LE-P8-O0-NEXT: std r15, 328(r1) # 8-byte Folded Spill ; LE-P8-O0-NEXT: std r16, 336(r1) # 8-byte Folded 
Spill @@ -1355,39 +1355,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P9-LABEL: spill: ; BE-P9: # %bb.0: # %entry -; BE-P9-NEXT: mfcr r12 ; BE-P9-NEXT: mflr r0 +; BE-P9-NEXT: mfcr r12 ; BE-P9-NEXT: stw r12, 8(r1) -; BE-P9-NEXT: stdu r1, -624(r1) -; BE-P9-NEXT: std r0, 640(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst r0, -488(r1) +; BE-P9-NEXT: stdu r1, -624(r1) +; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: std r14, 336(r1) # 8-byte Folded Spill +; BE-P9-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-NEXT: stxv v20, 144(r1) # 16-byte Folded Spill ; BE-P9-NEXT: stxv v21, 160(r1) # 16-byte Folded Spill -; BE-P9-NEXT: lwz r4, 12(r3) -; BE-P9-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-NEXT: stxv v22, 176(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r16, 352(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r17, 360(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r19, 376(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r20, 384(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r21, 392(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r22, 400(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r23, 408(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v29, 288(r1) # 16-byte Folded 
Spill ; BE-P9-NEXT: std r25, 424(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r26, 432(r1) # 8-byte Folded Spill -; BE-P9-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill +; BE-P9-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r27, 440(r1) # 8-byte Folded Spill +; BE-P9-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill ; BE-P9-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P9-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P9-NEXT: std r30, 464(r1) # 8-byte Folded Spill @@ -1482,62 +1482,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P8-LABEL: spill: ; BE-P8: # %bb.0: # %entry +; BE-P8-NEXT: mflr r0 ; BE-P8-NEXT: mfcr r12 ; BE-P8-NEXT: stw r12, 8(r1) -; BE-P8-NEXT: mflr r0 +; BE-P8-NEXT: std r0, 16(r1) +; BE-P8-NEXT: hashst r0, -488(r1) ; BE-P8-NEXT: stdu r1, -624(r1) ; BE-P8-NEXT: li r4, 144 -; BE-P8-NEXT: std r0, 640(r1) -; BE-P8-NEXT: hashst r0, -488(r1) ; BE-P8-NEXT: std r14, 336(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r16, 352(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r17, 360(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 160 ; BE-P8-NEXT: std r19, 376(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r20, 384(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 160 ; BE-P8-NEXT: std r21, 392(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r22, 400(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r23, 408(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 176 ; BE-P8-NEXT: std r25, 424(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r26, 432(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x 
v21, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 176 ; BE-P8-NEXT: std r27, 440(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r30, 464(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 192 ; BE-P8-NEXT: std r31, 472(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f14, 480(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 192 ; BE-P8-NEXT: stfd f15, 488(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f16, 496(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f17, 504(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f18, 512(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 208 ; BE-P8-NEXT: stfd f19, 520(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f20, 528(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 208 ; BE-P8-NEXT: stfd f21, 536(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f22, 544(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f23, 552(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f24, 560(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 224 ; BE-P8-NEXT: stfd f25, 568(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f26, 576(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-P8-NEXT: li r4, 224 ; BE-P8-NEXT: stfd f27, 584(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f28, 592(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f29, 600(r1) # 8-byte Folded Spill ; BE-P8-NEXT: stfd f30, 608(r1) # 8-byte Folded Spill -; BE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-P8-NEXT: li r4, 240 ; BE-P8-NEXT: stfd f31, 616(r1) # 8-byte Folded Spill ; BE-P8-NEXT: std r3, 120(r1) # 8-byte Folded Spill +; BE-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; 
BE-P8-NEXT: li r4, 240 ; BE-P8-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-P8-NEXT: li r4, 256 ; BE-P8-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill @@ -1759,10 +1759,10 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; BE-32BIT-P9-LABEL: spill: ; BE-32BIT-P9: # %bb.0: # %entry ; BE-32BIT-P9-NEXT: mflr r0 +; BE-32BIT-P9-NEXT: stw r0, 4(r1) +; BE-32BIT-P9-NEXT: hashst r0, -424(r1) ; BE-32BIT-P9-NEXT: stwu r1, -448(r1) ; BE-32BIT-P9-NEXT: mfcr r12 -; BE-32BIT-P9-NEXT: stw r0, 452(r1) -; BE-32BIT-P9-NEXT: hashst r0, -424(r1) ; BE-32BIT-P9-NEXT: stw r14, 232(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r15, 236(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: stw r16, 240(r1) # 4-byte Folded Spill @@ -1884,11 +1884,11 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; BE-32BIT-P8-LABEL: spill: ; BE-32BIT-P8: # %bb.0: # %entry ; BE-32BIT-P8-NEXT: mflr r0 +; BE-32BIT-P8-NEXT: stw r0, 4(r1) +; BE-32BIT-P8-NEXT: hashst r0, -424(r1) ; BE-32BIT-P8-NEXT: stwu r1, -448(r1) ; BE-32BIT-P8-NEXT: li r4, 32 ; BE-32BIT-P8-NEXT: mfcr r12 -; BE-32BIT-P8-NEXT: stw r0, 452(r1) -; BE-32BIT-P8-NEXT: hashst r0, -424(r1) ; BE-32BIT-P8-NEXT: stw r14, 232(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r15, 236(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r16, 240(r1) # 4-byte Folded Spill @@ -1917,26 +1917,26 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; BE-32BIT-P8-NEXT: stfd f14, 304(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f15, 312(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f16, 320(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 96 ; BE-32BIT-P8-NEXT: stfd f17, 328(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f18, 336(r1) # 8-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 96 ; BE-32BIT-P8-NEXT: stfd f19, 344(r1) # 8-byte Folded Spill ; 
BE-32BIT-P8-NEXT: stfd f20, 352(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f21, 360(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f22, 368(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 112 ; BE-32BIT-P8-NEXT: stfd f23, 376(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f24, 384(r1) # 8-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 112 ; BE-32BIT-P8-NEXT: stfd f25, 392(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f26, 400(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f27, 408(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f28, 416(r1) # 8-byte Folded Spill -; BE-32BIT-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-32BIT-P8-NEXT: li r4, 128 ; BE-32BIT-P8-NEXT: stfd f29, 424(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stfd f30, 432(r1) # 8-byte Folded Spill +; BE-32BIT-P8-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-32BIT-P8-NEXT: li r4, 128 ; BE-32BIT-P8-NEXT: stfd f31, 440(r1) # 8-byte Folded Spill ; BE-32BIT-P8-NEXT: stw r3, 16(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill @@ -2158,39 +2158,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; LE-P9-PRIV-LABEL: spill: ; LE-P9-PRIV: # %bb.0: # %entry -; LE-P9-PRIV-NEXT: mfcr r12 ; LE-P9-PRIV-NEXT: mflr r0 +; LE-P9-PRIV-NEXT: mfcr r12 ; LE-P9-PRIV-NEXT: stw r12, 8(r1) -; LE-P9-PRIV-NEXT: stdu r1, -544(r1) -; LE-P9-PRIV-NEXT: std r0, 560(r1) +; LE-P9-PRIV-NEXT: std r0, 16(r1) ; LE-P9-PRIV-NEXT: hashstp r0, -488(r1) +; LE-P9-PRIV-NEXT: stdu r1, -544(r1) +; LE-P9-PRIV-NEXT: lwz r4, 12(r3) ; LE-P9-PRIV-NEXT: std r14, 256(r1) # 8-byte Folded Spill +; LE-P9-PRIV-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P9-PRIV-NEXT: stxv v20, 64(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: stxv v21, 80(r1) # 16-byte Folded Spill -; LE-P9-PRIV-NEXT: lwz r4, 12(r3) -; 
LE-P9-PRIV-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P9-PRIV-NEXT: stxv v22, 96(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r16, 272(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v23, 112(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r17, 280(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v24, 128(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v23, 112(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r18, 288(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v25, 144(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v24, 128(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r19, 296(r1) # 8-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v25, 144(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r20, 304(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v26, 160(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r21, 312(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v27, 176(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v26, 160(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r22, 320(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v28, 192(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v27, 176(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r23, 328(r1) # 8-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v28, 192(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r24, 336(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v29, 208(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r25, 344(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v30, 224(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v29, 208(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r26, 352(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stxv v31, 240(r1) # 16-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v30, 224(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r27, 360(r1) # 8-byte Folded Spill +; LE-P9-PRIV-NEXT: stxv v31, 240(r1) # 16-byte Folded Spill ; LE-P9-PRIV-NEXT: std r28, 368(r1) # 8-byte Folded Spill ; LE-P9-PRIV-NEXT: std r29, 
376(r1) # 8-byte Folded Spill ; LE-P9-PRIV-NEXT: std r30, 384(r1) # 8-byte Folded Spill @@ -2285,62 +2285,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; LE-P8-PRIV-LABEL: spill: ; LE-P8-PRIV: # %bb.0: # %entry +; LE-P8-PRIV-NEXT: mflr r0 ; LE-P8-PRIV-NEXT: mfcr r12 ; LE-P8-PRIV-NEXT: stw r12, 8(r1) -; LE-P8-PRIV-NEXT: mflr r0 +; LE-P8-PRIV-NEXT: std r0, 16(r1) +; LE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; LE-P8-PRIV-NEXT: stdu r1, -544(r1) ; LE-P8-PRIV-NEXT: li r4, 64 -; LE-P8-PRIV-NEXT: std r0, 560(r1) -; LE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; LE-P8-PRIV-NEXT: std r14, 256(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r15, 264(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r16, 272(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r17, 280(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r18, 288(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; LE-P8-PRIV-NEXT: li r4, 80 ; LE-P8-PRIV-NEXT: std r19, 296(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r20, 304(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 80 ; LE-P8-PRIV-NEXT: std r21, 312(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r22, 320(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r23, 328(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r24, 336(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; LE-P8-PRIV-NEXT: li r4, 96 ; LE-P8-PRIV-NEXT: std r25, 344(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r26, 352(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 96 ; LE-P8-PRIV-NEXT: std r27, 360(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r28, 368(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r29, 376(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r30, 384(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill 
-; LE-P8-PRIV-NEXT: li r4, 112 ; LE-P8-PRIV-NEXT: std r31, 392(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f14, 400(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 112 ; LE-P8-PRIV-NEXT: stfd f15, 408(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f16, 416(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f17, 424(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f18, 432(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; LE-P8-PRIV-NEXT: li r4, 128 ; LE-P8-PRIV-NEXT: stfd f19, 440(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f20, 448(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 128 ; LE-P8-PRIV-NEXT: stfd f21, 456(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f22, 464(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f23, 472(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f24, 480(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; LE-P8-PRIV-NEXT: li r4, 144 ; LE-P8-PRIV-NEXT: stfd f25, 488(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f26, 496(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 144 ; LE-P8-PRIV-NEXT: stfd f27, 504(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f28, 512(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f29, 520(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: stfd f30, 528(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; LE-P8-PRIV-NEXT: li r4, 160 ; LE-P8-PRIV-NEXT: stfd f31, 536(r1) # 8-byte Folded Spill ; LE-P8-PRIV-NEXT: std r3, 40(r1) # 8-byte Folded Spill +; LE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; LE-P8-PRIV-NEXT: li r4, 160 ; LE-P8-PRIV-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; LE-P8-PRIV-NEXT: li r4, 176 ; LE-P8-PRIV-NEXT: stxvd2x v27, r1, r4 # 
16-byte Folded Spill @@ -2563,39 +2563,39 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P9-PRIV-LABEL: spill: ; BE-P9-PRIV: # %bb.0: # %entry -; BE-P9-PRIV-NEXT: mfcr r12 ; BE-P9-PRIV-NEXT: mflr r0 +; BE-P9-PRIV-NEXT: mfcr r12 ; BE-P9-PRIV-NEXT: stw r12, 8(r1) -; BE-P9-PRIV-NEXT: stdu r1, -624(r1) -; BE-P9-PRIV-NEXT: std r0, 640(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -488(r1) +; BE-P9-PRIV-NEXT: stdu r1, -624(r1) +; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: std r14, 336(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v20, 144(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v21, 160(r1) # 16-byte Folded Spill -; BE-P9-PRIV-NEXT: lwz r4, 12(r3) -; BE-P9-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: stxv v22, 176(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r16, 352(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r17, 360(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v23, 192(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v24, 208(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r19, 376(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v25, 224(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r20, 384(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r21, 392(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v26, 240(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r22, 400(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v27, 256(r1) # 16-byte Folded Spill ; 
BE-P9-PRIV-NEXT: std r23, 408(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v28, 272(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r25, 424(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v29, 288(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r26, 432(r1) # 8-byte Folded Spill -; BE-P9-PRIV-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v30, 304(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r27, 440(r1) # 8-byte Folded Spill +; BE-P9-PRIV-NEXT: stxv v31, 320(r1) # 16-byte Folded Spill ; BE-P9-PRIV-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: std r30, 464(r1) # 8-byte Folded Spill @@ -2690,62 +2690,62 @@ define dso_local zeroext i32 @spill(ptr nocapture readonly %in) #0 { ; ; BE-P8-PRIV-LABEL: spill: ; BE-P8-PRIV: # %bb.0: # %entry +; BE-P8-PRIV-NEXT: mflr r0 ; BE-P8-PRIV-NEXT: mfcr r12 ; BE-P8-PRIV-NEXT: stw r12, 8(r1) -; BE-P8-PRIV-NEXT: mflr r0 +; BE-P8-PRIV-NEXT: std r0, 16(r1) +; BE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; BE-P8-PRIV-NEXT: stdu r1, -624(r1) ; BE-P8-PRIV-NEXT: li r4, 144 -; BE-P8-PRIV-NEXT: std r0, 640(r1) -; BE-P8-PRIV-NEXT: hashstp r0, -488(r1) ; BE-P8-PRIV-NEXT: std r14, 336(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r15, 344(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r16, 352(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r17, 360(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r18, 368(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 160 ; BE-P8-PRIV-NEXT: std r19, 376(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r20, 384(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 160 ; BE-P8-PRIV-NEXT: std 
r21, 392(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r22, 400(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r23, 408(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r24, 416(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 176 ; BE-P8-PRIV-NEXT: std r25, 424(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r26, 432(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 176 ; BE-P8-PRIV-NEXT: std r27, 440(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r28, 448(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r29, 456(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r30, 464(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 192 ; BE-P8-PRIV-NEXT: std r31, 472(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f14, 480(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 192 ; BE-P8-PRIV-NEXT: stfd f15, 488(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f16, 496(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f17, 504(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f18, 512(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 208 ; BE-P8-PRIV-NEXT: stfd f19, 520(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f20, 528(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 208 ; BE-P8-PRIV-NEXT: stfd f21, 536(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f22, 544(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f23, 552(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f24, 560(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 224 ; BE-P8-PRIV-NEXT: stfd f25, 568(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: 
stfd f26, 576(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 224 ; BE-P8-PRIV-NEXT: stfd f27, 584(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f28, 592(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f29, 600(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: stfd f30, 608(r1) # 8-byte Folded Spill -; BE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill -; BE-P8-PRIV-NEXT: li r4, 240 ; BE-P8-PRIV-NEXT: stfd f31, 616(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: std r3, 120(r1) # 8-byte Folded Spill +; BE-P8-PRIV-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill +; BE-P8-PRIV-NEXT: li r4, 240 ; BE-P8-PRIV-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill ; BE-P8-PRIV-NEXT: li r4, 256 ; BE-P8-PRIV-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill @@ -2890,11 +2890,11 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P9-NEXT: # %bb.1: # %if.end ; LE-P9-NEXT: mflr r0 ; LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P9-NEXT: stdu r1, -64(r1) -; LE-P9-NEXT: std r0, 80(r1) +; LE-P9-NEXT: std r0, 16(r1) ; LE-P9-NEXT: hashst r0, -24(r1) -; LE-P9-NEXT: mr r30, r3 +; LE-P9-NEXT: stdu r1, -64(r1) ; LE-P9-NEXT: lwz r4, 12(r3) +; LE-P9-NEXT: mr r30, r3 ; LE-P9-NEXT: stw r4, 36(r1) ; LE-P9-NEXT: addi r4, r1, 36 ; LE-P9-NEXT: mr r3, r4 @@ -2920,11 +2920,11 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P8-NEXT: # %bb.1: # %if.end ; LE-P8-NEXT: mflr r0 ; LE-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P8-NEXT: stdu r1, -64(r1) -; LE-P8-NEXT: std r0, 80(r1) +; LE-P8-NEXT: std r0, 16(r1) ; LE-P8-NEXT: hashst r0, -24(r1) -; LE-P8-NEXT: mr r30, r3 +; LE-P8-NEXT: stdu r1, -64(r1) ; LE-P8-NEXT: lwz r4, 12(r3) +; LE-P8-NEXT: mr r30, r3 ; LE-P8-NEXT: stw r4, 36(r1) ; LE-P8-NEXT: addi r4, r1, 36 ; LE-P8-NEXT: mr r3, r4 @@ -2978,9 +2978,9 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P9-O0-LABEL: shrinkwrap: ; LE-P9-O0: # %bb.0: 
# %entry ; LE-P9-O0-NEXT: mflr r0 -; LE-P9-O0-NEXT: stdu r1, -128(r1) -; LE-P9-O0-NEXT: std r0, 144(r1) +; LE-P9-O0-NEXT: std r0, 16(r1) ; LE-P9-O0-NEXT: hashst r0, -8(r1) +; LE-P9-O0-NEXT: stdu r1, -128(r1) ; LE-P9-O0-NEXT: mr. r4, r3 ; LE-P9-O0-NEXT: std r4, 104(r1) # 8-byte Folded Spill ; LE-P9-O0-NEXT: li r3, 0 @@ -3010,9 +3010,9 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P8-O0-LABEL: shrinkwrap: ; LE-P8-O0: # %bb.0: # %entry ; LE-P8-O0-NEXT: mflr r0 -; LE-P8-O0-NEXT: stdu r1, -128(r1) -; LE-P8-O0-NEXT: std r0, 144(r1) +; LE-P8-O0-NEXT: std r0, 16(r1) ; LE-P8-O0-NEXT: hashst r0, -8(r1) +; LE-P8-O0-NEXT: stdu r1, -128(r1) ; LE-P8-O0-NEXT: mr. r4, r3 ; LE-P8-O0-NEXT: std r4, 104(r1) # 8-byte Folded Spill ; LE-P8-O0-NEXT: li r3, 0 @@ -3075,12 +3075,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P9-NEXT: beq cr0, .LBB2_2 ; BE-P9-NEXT: # %bb.1: # %if.end ; BE-P9-NEXT: mflr r0 -; BE-P9-NEXT: stdu r1, -144(r1) -; BE-P9-NEXT: std r0, 160(r1) +; BE-P9-NEXT: std r0, 16(r1) ; BE-P9-NEXT: hashst r0, -24(r1) +; BE-P9-NEXT: stdu r1, -144(r1) +; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P9-NEXT: mr r30, r3 -; BE-P9-NEXT: lwz r4, 12(r3) ; BE-P9-NEXT: stw r4, 116(r1) ; BE-P9-NEXT: addi r4, r1, 116 ; BE-P9-NEXT: mr r3, r4 @@ -3105,12 +3105,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P8-NEXT: beq cr0, .LBB2_2 ; BE-P8-NEXT: # %bb.1: # %if.end ; BE-P8-NEXT: mflr r0 -; BE-P8-NEXT: stdu r1, -144(r1) -; BE-P8-NEXT: std r0, 160(r1) +; BE-P8-NEXT: std r0, 16(r1) ; BE-P8-NEXT: hashst r0, -24(r1) +; BE-P8-NEXT: stdu r1, -144(r1) +; BE-P8-NEXT: lwz r4, 12(r3) ; BE-P8-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P8-NEXT: mr r30, r3 -; BE-P8-NEXT: lwz r4, 12(r3) ; BE-P8-NEXT: stw r4, 116(r1) ; BE-P8-NEXT: addi r4, r1, 116 ; BE-P8-NEXT: mr r3, r4 @@ -3161,10 +3161,10 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P9-LABEL: 
shrinkwrap: ; BE-32BIT-P9: # %bb.0: # %entry ; BE-32BIT-P9-NEXT: mflr r0 +; BE-32BIT-P9-NEXT: stw r0, 4(r1) +; BE-32BIT-P9-NEXT: hashst r0, -16(r1) ; BE-32BIT-P9-NEXT: stwu r1, -32(r1) ; BE-32BIT-P9-NEXT: cmplwi r3, 0 -; BE-32BIT-P9-NEXT: stw r0, 36(r1) -; BE-32BIT-P9-NEXT: hashst r0, -16(r1) ; BE-32BIT-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill ; BE-32BIT-P9-NEXT: beq cr0, .LBB2_2 ; BE-32BIT-P9-NEXT: # %bb.1: # %if.end @@ -3190,10 +3190,10 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-32BIT-P8-LABEL: shrinkwrap: ; BE-32BIT-P8: # %bb.0: # %entry ; BE-32BIT-P8-NEXT: mflr r0 +; BE-32BIT-P8-NEXT: stw r0, 4(r1) +; BE-32BIT-P8-NEXT: hashst r0, -16(r1) ; BE-32BIT-P8-NEXT: stwu r1, -32(r1) ; BE-32BIT-P8-NEXT: cmplwi r3, 0 -; BE-32BIT-P8-NEXT: stw r0, 36(r1) -; BE-32BIT-P8-NEXT: hashst r0, -16(r1) ; BE-32BIT-P8-NEXT: stw r30, 24(r1) # 4-byte Folded Spill ; BE-32BIT-P8-NEXT: beq cr0, .LBB2_2 ; BE-32BIT-P8-NEXT: # %bb.1: # %if.end @@ -3252,11 +3252,11 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P9-PRIV-NEXT: # %bb.1: # %if.end ; LE-P9-PRIV-NEXT: mflr r0 ; LE-P9-PRIV-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P9-PRIV-NEXT: stdu r1, -64(r1) -; LE-P9-PRIV-NEXT: std r0, 80(r1) +; LE-P9-PRIV-NEXT: std r0, 16(r1) ; LE-P9-PRIV-NEXT: hashstp r0, -24(r1) -; LE-P9-PRIV-NEXT: mr r30, r3 +; LE-P9-PRIV-NEXT: stdu r1, -64(r1) ; LE-P9-PRIV-NEXT: lwz r4, 12(r3) +; LE-P9-PRIV-NEXT: mr r30, r3 ; LE-P9-PRIV-NEXT: stw r4, 36(r1) ; LE-P9-PRIV-NEXT: addi r4, r1, 36 ; LE-P9-PRIV-NEXT: mr r3, r4 @@ -3282,11 +3282,11 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; LE-P8-PRIV-NEXT: # %bb.1: # %if.end ; LE-P8-PRIV-NEXT: mflr r0 ; LE-P8-PRIV-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; LE-P8-PRIV-NEXT: stdu r1, -64(r1) -; LE-P8-PRIV-NEXT: std r0, 80(r1) +; LE-P8-PRIV-NEXT: std r0, 16(r1) ; LE-P8-PRIV-NEXT: hashstp r0, -24(r1) -; LE-P8-PRIV-NEXT: mr r30, r3 +; LE-P8-PRIV-NEXT: stdu r1, -64(r1) ; LE-P8-PRIV-NEXT: 
lwz r4, 12(r3) +; LE-P8-PRIV-NEXT: mr r30, r3 ; LE-P8-PRIV-NEXT: stw r4, 36(r1) ; LE-P8-PRIV-NEXT: addi r4, r1, 36 ; LE-P8-PRIV-NEXT: mr r3, r4 @@ -3341,12 +3341,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P9-PRIV-NEXT: beq cr0, .LBB2_2 ; BE-P9-PRIV-NEXT: # %bb.1: # %if.end ; BE-P9-PRIV-NEXT: mflr r0 -; BE-P9-PRIV-NEXT: stdu r1, -144(r1) -; BE-P9-PRIV-NEXT: std r0, 160(r1) +; BE-P9-PRIV-NEXT: std r0, 16(r1) ; BE-P9-PRIV-NEXT: hashstp r0, -24(r1) +; BE-P9-PRIV-NEXT: stdu r1, -144(r1) +; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P9-PRIV-NEXT: mr r30, r3 -; BE-P9-PRIV-NEXT: lwz r4, 12(r3) ; BE-P9-PRIV-NEXT: stw r4, 116(r1) ; BE-P9-PRIV-NEXT: addi r4, r1, 116 ; BE-P9-PRIV-NEXT: mr r3, r4 @@ -3371,12 +3371,12 @@ define dso_local zeroext i32 @shrinkwrap(ptr readonly %in) #0 { ; BE-P8-PRIV-NEXT: beq cr0, .LBB2_2 ; BE-P8-PRIV-NEXT: # %bb.1: # %if.end ; BE-P8-PRIV-NEXT: mflr r0 -; BE-P8-PRIV-NEXT: stdu r1, -144(r1) -; BE-P8-PRIV-NEXT: std r0, 160(r1) +; BE-P8-PRIV-NEXT: std r0, 16(r1) ; BE-P8-PRIV-NEXT: hashstp r0, -24(r1) +; BE-P8-PRIV-NEXT: stdu r1, -144(r1) +; BE-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-P8-PRIV-NEXT: std r30, 128(r1) # 8-byte Folded Spill ; BE-P8-PRIV-NEXT: mr r30, r3 -; BE-P8-PRIV-NEXT: lwz r4, 12(r3) ; BE-P8-PRIV-NEXT: stw r4, 116(r1) ; BE-P8-PRIV-NEXT: addi r4, r1, 116 ; BE-P8-PRIV-NEXT: mr r3, r4 From 1687aa2a996f4059f275c83d5db635d43165d36c Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 22 Jan 2025 13:49:54 -0500 Subject: [PATCH 030/208] [RISCV][VLOPT] Don't reduce the VL is the same as CommonVL (#123878) This fixes the slowdown in #123862. 
--- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 6 +++++ llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll | 27 ++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 66d26bf5b11e2d..fc3300247b1909 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1313,6 +1313,12 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) { return false; } + if (CommonVL->isIdenticalTo(VLOp)) { + LLVM_DEBUG( + dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n"); + return false; + } + if (CommonVL->isImm()) { LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " << CommonVL->getImm() << " for " << MI << "\n"); diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll new file mode 100644 index 00000000000000..65e6eddfb3cd60 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-enable-vl-optimizer \ +; RUN: -verify-machineinstrs -debug-only=riscv-vl-optimizer -o - 2>&1 %s | FileCheck %s + +; REQUIRES: asserts + +; GitHub Issue #123862 provided a case where the riscv-vl-optimizer pass was +; very slow. It was found that that case benefited greatly from aborting due +; to CommonVL == VLOp. Adding the case provided in the issue would show up +; as a long running test instead of a test failure. We would likley have a hard +; time figuring if that case had a regression. So instead, we check this output +; which was responsible for speeding it up. + +define @same_vl_imm( %passthru, %a, %b) { + ; CHECK: User VL is: 4 + ; CHECK-NEXT: Abort due to CommonVL == VLOp, no point in reducing. 
+ %v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, i64 4) + %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, i64 4) + ret %w +} + +define @same_vl_reg( %passthru, %a, %b, i64 %vl) { + ; CHECK: User VL is: %3:gprnox0 + ; CHECK-NEXT: Abort due to CommonVL == VLOp, no point in reducing. + %v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, i64 %vl) + %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, i64 %vl) + ret %w +} From 5ede7b6a6bc22aee86e592835ccc4eaa9459e5cd Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 22 Jan 2025 14:18:07 -0500 Subject: [PATCH 031/208] Revert "Reapply "[Clang][Sema] Use the correct lookup context when building overloaded 'operator->' in the current instantiation (#104458)"" (#123982) Reverts llvm/llvm-project#109422 --- clang/include/clang/Sema/Sema.h | 5 ++- clang/lib/Sema/SemaExprCXX.cpp | 21 +++++++-- clang/lib/Sema/SemaExprMember.cpp | 2 +- clang/lib/Sema/SemaOverload.cpp | 18 ++------ clang/lib/Sema/TreeTransform.h | 5 +-- .../temp.res/temp.dep/temp.dep.type/p4.cpp | 45 +++++-------------- 6 files changed, 38 insertions(+), 58 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a2a47d535b8e06..9fa33d6ca76ba5 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10608,8 +10608,9 @@ class Sema final : public SemaBase { /// BuildOverloadedArrowExpr - Build a call to an overloaded @c operator-> /// (if one exists), where @c Base is an expression of class type and /// @c Member is the name of the member we're trying to find. 
- ExprResult BuildOverloadedArrowExpr(Expr *Base, SourceLocation OpLoc, - bool *NoArrowOperatorFound); + ExprResult BuildOverloadedArrowExpr(Scope *S, Expr *Base, + SourceLocation OpLoc, + bool *NoArrowOperatorFound = nullptr); ExprResult BuildCXXMemberCallExpr(Expr *Exp, NamedDecl *FoundDecl, CXXConversionDecl *Method, diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 0ebf5f54613926..1e39d69e8b230f 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -7999,6 +7999,18 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, QualType BaseType = Base->getType(); MayBePseudoDestructor = false; + if (BaseType->isDependentType()) { + // If we have a pointer to a dependent type and are using the -> operator, + // the object type is the type that the pointer points to. We might still + // have enough information about that type to do something useful. + if (OpKind == tok::arrow) + if (const PointerType *Ptr = BaseType->getAs()) + BaseType = Ptr->getPointeeType(); + + ObjectType = ParsedType::make(BaseType); + MayBePseudoDestructor = true; + return Base; + } // C++ [over.match.oper]p8: // [...] 
When operator->returns, the operator-> is applied to the value @@ -8013,7 +8025,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, SmallVector OperatorArrows; CTypes.insert(Context.getCanonicalType(BaseType)); - while (BaseType->getAsRecordDecl()) { + while (BaseType->isRecordType()) { if (OperatorArrows.size() >= getLangOpts().ArrowDepth) { Diag(OpLoc, diag::err_operator_arrow_depth_exceeded) << StartingType << getLangOpts().ArrowDepth << Base->getSourceRange(); @@ -8024,7 +8036,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, } Result = BuildOverloadedArrowExpr( - Base, OpLoc, + S, Base, OpLoc, // When in a template specialization and on the first loop iteration, // potentially give the default diagnostic (with the fixit in a // separate note) instead of having the error reported back to here @@ -8088,7 +8100,7 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, // it's legal for the type to be incomplete if this is a pseudo-destructor // call. We'll do more incomplete-type checks later in the lookup process, // so just skip this check for ObjC types. - if (BaseType->isDependentType() || !BaseType->isRecordType()) { + if (!BaseType->isRecordType()) { ObjectType = ParsedType::make(BaseType); MayBePseudoDestructor = true; return Base; @@ -8099,7 +8111,8 @@ ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, // Unlike the object expression in other contexts, *this is not required to // be of complete type for purposes of class member access (5.2.5) outside // the member function body. 
- if (!isThisOutsideMemberFunctionBody(BaseType) && + if (!BaseType->isDependentType() && + !isThisOutsideMemberFunctionBody(BaseType) && RequireCompleteType(OpLoc, BaseType, diag::err_incomplete_member_access)) { return CreateRecoveryExpr(Base->getBeginLoc(), Base->getEndLoc(), {Base}); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 8326a4db0a7719..d130e8b86bc56d 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -1357,7 +1357,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R, BaseType = Ptr->getPointeeType(); else if (BaseType->isFunctionType()) goto fail; - else if (BaseExpr.get()->isTypeDependent()) + else if (BaseType->isDependentType()) BaseType = S.Context.DependentTy; else if (BaseType->isRecordType()) { // Recover from arrow accesses to records, e.g.: diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index bf4c0288274ac7..23056ca5deba3c 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -15962,9 +15962,10 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, return CheckForImmediateInvocation(MaybeBindToTemporary(TheCall), Method); } -ExprResult Sema::BuildOverloadedArrowExpr(Expr *Base, SourceLocation OpLoc, +ExprResult Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, + SourceLocation OpLoc, bool *NoArrowOperatorFound) { - assert(Base->getType()->getAsRecordDecl() && + assert(Base->getType()->isRecordType() && "left-hand side must have class type"); if (checkPlaceholderForOverload(*this, Base)) @@ -15987,20 +15988,9 @@ ExprResult Sema::BuildOverloadedArrowExpr(Expr *Base, SourceLocation OpLoc, return ExprError(); LookupResult R(*this, OpName, OpLoc, LookupOrdinaryName); - LookupParsedName(R, /*S=*/nullptr, /*SS=*/nullptr, Base->getType()); + LookupQualifiedName(R, Base->getType()->castAs()->getDecl()); R.suppressAccessDiagnostics(); - if (Base->getType()->isDependentType() && - (!R.empty() 
|| R.wasNotFoundInCurrentInstantiation())) { - DeclarationNameInfo OpNameInfo(OpName, OpLoc); - ExprResult Fn = CreateUnresolvedLookupExpr( - /*NamingClass=*/nullptr, /*NNSLoc=*/NestedNameSpecifierLoc(), - OpNameInfo, R.asUnresolvedSet(), /*PerformADL=*/false); - return CXXOperatorCallExpr::Create(Context, OO_Arrow, Fn.get(), Base, - Context.DependentTy, VK_PRValue, OpLoc, - CurFPFeatureOverrides()); - } - for (LookupResult::iterator Oper = R.begin(), OperEnd = R.end(); Oper != OperEnd; ++Oper) { AddMethodCandidate(Oper.getPair(), Base->getType(), Base->Classify(Context), diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index c0363692a3eb72..7dc88a1ae23b98 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -17282,11 +17282,10 @@ ExprResult TreeTransform::RebuildCXXOperatorCallExpr( } else if (Op == OO_Arrow) { // It is possible that the type refers to a RecoveryExpr created earlier // in the tree transformation. - if (First->containsErrors()) + if (First->getType()->isDependentType()) return ExprError(); // -> is never a builtin operation. 
- return getSema().BuildOverloadedArrowExpr(First, OpLoc, - /*NoArrowOperatorFound=*/nullptr); + return SemaRef.BuildOverloadedArrowExpr(nullptr, First, OpLoc); } else if (Second == nullptr || isPostIncDec) { if (!First->getType()->isOverloadableType() || (Op == OO_Amp && getSema().isQualifiedMemberAccess(First))) { diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp index 03eda1f13feed7..f32f49ef4539a5 100644 --- a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp +++ b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp @@ -484,19 +484,16 @@ namespace N4 { template struct A { void not_instantiated(A a, A b, T c) { - a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + a->x; + b->x; c->x; } void instantiated(A a, A b, T c) { - // FIXME: We should only emit a single diagnostic suggesting to use '.'! 
- a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-1 {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-2 {{no member named 'x' in 'N4::A'}} - b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-1 {{member reference type 'A' is not a pointer; did you mean to use '.'?}} - // expected-error@-2 {{no member named 'x' in 'N4::A'}} + a->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-1 {{no member named 'x' in 'N4::A'}} + b->x; // expected-error {{member reference type 'A' is not a pointer; did you mean to use '.'?}} + // expected-error@-1 {{no member named 'x' in 'N4::A'}} c->x; // expected-error {{member reference type 'int' is not a pointer}} } }; @@ -543,10 +540,11 @@ namespace N4 { a->T::f(); a->T::g(); - a->U::x; - a->U::y; - a->U::f(); - a->U::g(); + // FIXME: 'U' should be a dependent name, and its lookup context should be 'a.operator->()'! 
+ a->U::x; // expected-error {{use of undeclared identifier 'U'}} + a->U::y; // expected-error {{use of undeclared identifier 'U'}} + a->U::f(); // expected-error {{use of undeclared identifier 'U'}} + a->U::g(); // expected-error {{use of undeclared identifier 'U'}} } void instantiated(D a) { @@ -607,24 +605,3 @@ namespace N5 { template void g(int); // expected-note {{in instantiation of}} } // namespace N5 - -namespace N6 { - struct A { - int x; - }; - - struct B { - A* operator->(); - }; - - struct C { - B y; - }; - - template - struct D : C { - void f() { - y->x; - } - }; -} // namespace N6 From 7ad8a3da4771ce8abbd146611124104d42a4e63e Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 22 Jan 2025 11:35:52 -0800 Subject: [PATCH 032/208] [MemProf] Simplify edge iterations (NFC) (#123469) Remove edge iterator parameters from the various helpers that move edges onto other nodes, and their associated iterator update code, and instead iterate over copies of the edge lists in the caller loops. This also avoids the need to increment these iterators at every early loop continue. This simplifies the code, makes it less error prone when updating, and in particular, facilitates adding handling of recursive contexts. There were no measurable compile time and memory overhead effects for a large target. --- .../IPO/MemProfContextDisambiguation.cpp | 116 +++++++----------- 1 file changed, 45 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index f027c952f8cdc5..1966ce29083716 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -692,32 +692,28 @@ class CallsiteContextGraph { /// Create a clone of Edge's callee and move Edge to that new callee node, /// performing the necessary context id and allocation type updates. 
- /// If callee's caller edge iterator is supplied, it is updated when removing - /// the edge from that list. If ContextIdsToMove is non-empty, only that - /// subset of Edge's ids are moved to an edge to the new callee. + /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are + /// moved to an edge to the new callee. ContextNode * moveEdgeToNewCalleeClone(const std::shared_ptr &Edge, - EdgeIter *CallerEdgeI = nullptr, DenseSet ContextIdsToMove = {}); /// Change the callee of Edge to existing callee clone NewCallee, performing /// the necessary context id and allocation type updates. - /// If callee's caller edge iterator is supplied, it is updated when removing - /// the edge from that list. If ContextIdsToMove is non-empty, only that - /// subset of Edge's ids are moved to an edge to the new callee. + /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are + /// moved to an edge to the new callee. void moveEdgeToExistingCalleeClone(const std::shared_ptr &Edge, ContextNode *NewCallee, - EdgeIter *CallerEdgeI = nullptr, bool NewClone = false, DenseSet ContextIdsToMove = {}); /// Change the caller of the edge at the given callee edge iterator to be /// NewCaller, performing the necessary context id and allocation type - /// updates. The iterator is updated as the edge is removed from the list of - /// callee edges in the original caller. This is similar to the above - /// moveEdgeToExistingCalleeClone, but a simplified version of it as we always - /// move the given edge and all of its context ids. - void moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller); + /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but + /// a simplified version of it as we always move the given edge and all of its + /// context ids. 
+ void moveCalleeEdgeToNewCaller(const std::shared_ptr &Edge, + ContextNode *NewCaller); /// Recursively perform cloning on the graph for the given Node and its /// callers, in order to uniquely identify the allocation behavior of an @@ -2313,12 +2309,13 @@ bool CallsiteContextGraph::partitionCallsByCallee( // Track whether we already assigned original node to a callee. bool UsedOrigNode = false; assert(NodeToCallingFunc[Node]); - for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { - auto Edge = *EI; - if (!Edge->Callee->hasCall()) { - ++EI; + // Iterate over a copy of Node's callee edges, since we may need to remove + // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and + // makes it less error-prone. + auto CalleeEdges = Node->CalleeEdges; + for (auto &Edge : CalleeEdges) { + if (!Edge->Callee->hasCall()) continue; - } // Will be updated below to point to whatever (caller) node this callee edge // should be moved to. @@ -2361,12 +2358,10 @@ bool CallsiteContextGraph::partitionCallsByCallee( } // Don't need to move edge if we are using the original node; - if (CallerNodeToUse == Node) { - ++EI; + if (CallerNodeToUse == Node) continue; - } - moveCalleeEdgeToNewCaller(EI, CallerNodeToUse); + moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse); } // Now that we are done moving edges, clean up any caller edges that ended // up with no type or context ids. 
During moveCalleeEdgeToNewCaller all @@ -3046,7 +3041,7 @@ void CallsiteContextGraph::exportToDot( template typename CallsiteContextGraph::ContextNode * CallsiteContextGraph::moveEdgeToNewCalleeClone( - const std::shared_ptr &Edge, EdgeIter *CallerEdgeI, + const std::shared_ptr &Edge, DenseSet ContextIdsToMove) { ContextNode *Node = Edge->Callee; assert(NodeToCallingFunc.count(Node)); @@ -3054,7 +3049,7 @@ CallsiteContextGraph::moveEdgeToNewCalleeClone( createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call); Node->addClone(Clone); Clone->MatchingCalls = Node->MatchingCalls; - moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true, + moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true, ContextIdsToMove); return Clone; } @@ -3062,8 +3057,7 @@ CallsiteContextGraph::moveEdgeToNewCalleeClone( template void CallsiteContextGraph:: moveEdgeToExistingCalleeClone(const std::shared_ptr &Edge, - ContextNode *NewCallee, EdgeIter *CallerEdgeI, - bool NewClone, + ContextNode *NewCallee, bool NewClone, DenseSet ContextIdsToMove) { // NewCallee and Edge's current callee must be clones of the same original // node (Edge's current callee may be the original node too). @@ -3094,23 +3088,18 @@ void CallsiteContextGraph:: ContextIdsToMove.end()); ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; assert(Edge->ContextIds == ContextIdsToMove); - removeEdgeFromGraph(Edge.get(), CallerEdgeI, /*CalleeIter=*/false); + removeEdgeFromGraph(Edge.get()); } else { // Otherwise just reconnect Edge to NewCallee. Edge->Callee = NewCallee; NewCallee->CallerEdges.push_back(Edge); // Remove it from callee where it was previously connected. - if (CallerEdgeI) - *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); - else - OldCallee->eraseCallerEdge(Edge.get()); + OldCallee->eraseCallerEdge(Edge.get()); // Don't need to update Edge's context ids since we are simply // reconnecting it. } } else { // Only moving a subset of Edge's ids. 
- if (CallerEdgeI) - ++(*CallerEdgeI); // Compute the alloc type of the subset of ids being moved. auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove); if (ExistingEdgeToNewCallee) { @@ -3183,16 +3172,16 @@ void CallsiteContextGraph:: template void CallsiteContextGraph:: - moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller) { - auto Edge = *CalleeEdgeI; + moveCalleeEdgeToNewCaller(const std::shared_ptr &Edge, + ContextNode *NewCaller) { ContextNode *OldCaller = Edge->Caller; + OldCaller->eraseCalleeEdge(Edge.get()); // We might already have an edge to the new caller. If one exists we will // reuse it. auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee); - CalleeEdgeI = OldCaller->CalleeEdges.erase(CalleeEdgeI); if (ExistingEdgeToNewCaller) { // Since we already have an edge to NewCaller, simply move the ids // onto it, and remove the existing Edge. @@ -3417,11 +3406,11 @@ void CallsiteContextGraph::identifyClones( // Iterate until we find no more opportunities for disambiguating the alloc // types via cloning. In most cases this loop will terminate once the Node // has a single allocation type, in which case no more cloning is needed. - // We need to be able to remove Edge from CallerEdges, so need to adjust - // iterator inside the loop. - for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { - auto CallerEdge = *EI; - + // Iterate over a copy of Node's caller edges, since we may need to remove + // edges in the moveEdgeTo* methods, and this simplifies the handling and + // makes it less error-prone. + auto CallerEdges = Node->CallerEdges; + for (auto &CallerEdge : CallerEdges) { // See if cloning the prior caller edge left this node with a single alloc // type or a single caller. In that case no more cloning of Node is needed. 
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) @@ -3429,10 +3418,8 @@ void CallsiteContextGraph::identifyClones( // If the caller was not successfully matched to a call in the IR/summary, // there is no point in trying to clone for it as we can't update that call. - if (!CallerEdge->Caller->hasCall()) { - ++EI; + if (!CallerEdge->Caller->hasCall()) continue; - } // Only need to process the ids along this edge pertaining to the given // allocation. @@ -3441,10 +3428,9 @@ void CallsiteContextGraph::identifyClones( if (!RecursiveContextIds.empty()) CallerEdgeContextsForAlloc = set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); - if (CallerEdgeContextsForAlloc.empty()) { - ++EI; + if (CallerEdgeContextsForAlloc.empty()) continue; - } + auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); // Compute the node callee edge alloc types corresponding to the context ids @@ -3471,10 +3457,8 @@ void CallsiteContextGraph::identifyClones( if (allocTypeToUse(CallerAllocTypeForAlloc) == allocTypeToUse(Node->AllocTypes) && allocTypesMatch( - CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { - ++EI; + CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) continue; - } // First see if we can use an existing clone. Check each clone and its // callee edges for matching alloc types. @@ -3504,14 +3488,11 @@ void CallsiteContextGraph::identifyClones( // The edge iterator is adjusted when we move the CallerEdge to the clone. 
if (Clone) - moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false, + moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false, CallerEdgeContextsForAlloc); else - Clone = - moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc); + Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc); - assert(EI == Node->CallerEdges.end() || - Node->AllocTypes != (uint8_t)AllocationType::None); // Sanity check that no alloc types on clone or its edges are None. assert(Clone->AllocTypes != (uint8_t)AllocationType::None); } @@ -3952,16 +3933,14 @@ bool CallsiteContextGraph::assignFunctions() { // assign this clone to. std::map FuncCloneToNewCallsiteCloneMap; FuncInfo FuncCloneAssignedToCurCallsiteClone; - // We need to be able to remove Edge from CallerEdges, so need to adjust - // iterator in the loop. - for (auto EI = Clone->CallerEdges.begin(); - EI != Clone->CallerEdges.end();) { - auto Edge = *EI; + // Iterate over a copy of Clone's caller edges, since we may need to + // remove edges in the moveEdgeTo* methods, and this simplifies the + // handling and makes it less error-prone. + auto CloneCallerEdges = Clone->CallerEdges; + for (auto &Edge : CloneCallerEdges) { // Ignore any caller that does not have a recorded callsite Call. - if (!Edge->Caller->hasCall()) { - EI++; + if (!Edge->Caller->hasCall()) continue; - } // If this caller already assigned to call a version of OrigFunc, need // to ensure we can assign this callsite clone to that function clone. if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { @@ -4006,27 +3985,24 @@ bool CallsiteContextGraph::assignFunctions() { FuncCloneCalledByCaller)) { ContextNode *NewClone = FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; - moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); + moveEdgeToExistingCalleeClone(Edge, NewClone); // Cleanup any none type edges cloned over. removeNoneTypeCalleeEdges(NewClone); } else { // Create a new callsite clone. 
- ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); + ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge); removeNoneTypeCalleeEdges(NewClone); FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = NewClone; // Add to list of clones and process later. ClonesWorklist.push_back(NewClone); - assert(EI == Clone->CallerEdges.end() || - Clone->AllocTypes != (uint8_t)AllocationType::None); assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); } // Moving the caller edge may have resulted in some none type // callee edges. removeNoneTypeCalleeEdges(Clone); // We will handle the newly created callsite clone in a subsequent - // iteration over this Node's Clones. Continue here since we - // already adjusted iterator EI while moving the edge. + // iteration over this Node's Clones. continue; } @@ -4074,8 +4050,6 @@ bool CallsiteContextGraph::assignFunctions() { RecordCalleeFuncOfCallsite(Edge->Caller, FuncCloneAssignedToCurCallsiteClone); } - - EI++; } } if (VerifyCCG) { From 3057d0f14af5e073be3b7c7942dfff2a975ac4cb Mon Sep 17 00:00:00 2001 From: AdityaK Date: Wed, 22 Jan 2025 12:00:56 -0800 Subject: [PATCH 033/208] Android defaults to pic (#123955) --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 6d94b964d50b01..c045069c34424f 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1863,14 +1863,6 @@ tools::ParsePICArgs(const ToolChain &ToolChain, const ArgList &Args) { // Android-specific defaults for PIC/PIE if (Triple.isAndroid()) { switch (Triple.getArch()) { - case llvm::Triple::aarch64: - case llvm::Triple::arm: - case llvm::Triple::armeb: - case llvm::Triple::thumb: - case llvm::Triple::thumbeb: - PIC = true; // "-fpic" - break; - case llvm::Triple::x86: case llvm::Triple::x86_64: PIC = true; // "-fPIC" @@ -1878,6 +1870,7 @@ 
tools::ParsePICArgs(const ToolChain &ToolChain, const ArgList &Args) { break; default: + PIC = true; // "-fpic" break; } } From 7e622b61320543b3706711609f1f32fd9ea3788d Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Wed, 22 Jan 2025 12:36:48 -0800 Subject: [PATCH 034/208] [TOSA] Change PadOp padding to tosa.shape (#123133) This patch changes PadOp's padding input to type !tosa.shape<2 * rank>, (where rank is the rank of the PadOp's input), instead of a tensor. This patch is also a part of TOSA v1.0 effort: https://discourse.llvm.org/t/rfc-tosa-dialect-increment-to-v1-0/83708 This patch updates the PadOp to match all against the TOSA v1.0 form. Original Authors include: @Tai78641 @wonjeon Co-authored-by: Tai Ly --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 10 ++-- .../mlir/Dialect/Tosa/Utils/ConversionUtils.h | 8 +++ .../Conversion/TosaToTensor/TosaToTensor.cpp | 27 +++++----- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 50 ++++++++---------- .../Tosa/Transforms/TosaDecomposeConv2D.cpp | 6 +-- .../Transforms/TosaDecomposeDepthwise.cpp | 6 +-- .../Transforms/TosaDecomposeTransposeConv.cpp | 29 ++++------- .../Dialect/Tosa/Utils/ConversionUtils.cpp | 33 ++++++++++++ .../TosaToTensor/tosa-to-tensor.mlir | 51 +++++++++++++------ mlir/test/Dialect/Tosa/canonicalize.mlir | 45 ++++++++-------- mlir/test/Dialect/Tosa/invalid.mlir | 38 +++++++------- mlir/test/Dialect/Tosa/ops.mlir | 10 ++-- .../Dialect/Tosa/tosa-decompose-conv2d.mlir | 4 +- .../Tosa/tosa-decompose-depthwise.mlir | 4 +- .../Tosa/tosa-decompose-transpose-conv.mlir | 14 ++--- mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 20 +++----- 16 files changed, 196 insertions(+), 159 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index e1efa7a3001b9f..2953e006bbe8d1 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1557,21 +1557,21 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { 
Example: ```mlir - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>) -> (tensor<4x9xf32>) + %0 = tosa.const_shape { value = dense<[1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + tosa.pad %arg0, %0 : (tensor<1x2xf32>, !tosa.shape<4>) -> (tensor<4x9xf32>) ``` Example 2: ```mlir - %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32> - tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>) -> (tensor) + %0 = tosa.const_shape { value = dense<[-1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + tosa.pad %arg0, %0 : (tensor<1x2xf32>, !tosa.shape<4>) -> (tensor) ``` }]; let arguments = (ins Tosa_RankedTensor:$input1, - TosaTensorRankOf<[Tosa_Int32Or64], [1]>:$padding, + Tosa_Shape:$padding, Optional:$pad_const, OptionalAttr:$quantization_info ); diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h index 90fea1f68beb58..78a8828855437e 100644 --- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h +++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h @@ -229,6 +229,14 @@ SmallVector applyTOSAPermutation(ArrayRef input, return permuted; } +// Computes shape value using tosa const_shape op. 
+Value getTosaConstShape(PatternRewriter &rewriter, Location loc, + llvm::ArrayRef shape); +SmallVector convertFromMlirShape(ArrayRef shape); + +bool getConstShapeValue(Operation *op, + llvm::SmallVector &result_shape); + } // namespace tosa } // namespace mlir diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp index b5a0da15e780e0..5aa0269a675cbe 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp @@ -306,7 +306,16 @@ class PadConverter : public OpConversionPattern { ConversionPatternRewriter &rewriter) const final { auto loc = padOp.getLoc(); auto input = padOp.getInput1(); - auto padding = padOp.getPadding(); + + ElementsAttr paddingElems; + if (!matchPattern(padOp.getPadding(), m_Constant(&paddingElems))) { + return rewriter.notifyMatchFailure( + padOp, "padding must be a static shape value"); + } + llvm::SmallVector paddingVals; + for (auto idx : paddingElems.getValues()) { + paddingVals.push_back(static_cast(idx.getInt())); + } ShapedType inputTy = cast(input.getType()); Type elementTy = inputTy.getElementType(); @@ -345,18 +354,10 @@ class PadConverter : public OpConversionPattern { highValues.reserve(rank); for (int i = 0; i < rank; i++) { - Value lowIndex = rewriter.create(loc, 2 * i); - Value highIndex = rewriter.create(loc, 2 * i + 1); - Value lowVal = rewriter.createOrFold( - loc, padding, ValueRange({lowIndex})); - Value highVal = rewriter.createOrFold( - loc, padding, ValueRange({highIndex})); - - lowVal = rewriter.createOrFold( - loc, rewriter.getIndexType(), lowVal); - highVal = rewriter.createOrFold( - loc, rewriter.getIndexType(), highVal); - + Value lowVal = rewriter.create( + loc, rewriter.getIndexAttr(paddingVals[2 * i])); + Value highVal = rewriter.create( + loc, rewriter.getIndexAttr(paddingVals[2 * i + 1])); lowValues.push_back(lowVal); highValues.push_back(highVal); } diff --git 
a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index de5ff61b5848e3..fdccce60fe1d86 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -36,6 +36,7 @@ using namespace mlir; using namespace mlir::tosa; #include "mlir/Dialect/Tosa/IR/TosaOpsDialect.cpp.inc" +#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" //===----------------------------------------------------------------------===// // Tosa dialect interface includes. @@ -822,51 +823,42 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents( PadOp::Adaptor adaptor, SmallVectorImpl &inferredReturnShapes) { ShapeAdaptor inputShape(adaptor.getInput1().getType()); - ShapeAdaptor paddingShape(adaptor.getPadding().getType()); + auto paddingRank = + cast(adaptor.getPadding().getType()).getRank(); SmallVector outputShape; - // If both inputs have unknown shape, we cannot determine the shape of the - // output. - if (!inputShape.hasRank() && !paddingShape.hasRank()) { - inferredReturnShapes.push_back(ShapedTypeComponents()); - return success(); - } - - // If the input rank is unknown we can info the output rank using the - // padding shape's first dim. + // If the input rank is unknown, we can infer the output rank using the + // padding shape's rank divided by 2. if (!inputShape.hasRank()) { - if (paddingShape.isDynamicDim(0)) { - inferredReturnShapes.push_back(ShapedTypeComponents()); - return success(); - } - - outputShape.resize(paddingShape.getDimSize(0) / 2, ShapedType::kDynamic); + outputShape.resize(paddingRank / 2, ShapedType::kDynamic); inferredReturnShapes.push_back(ShapedTypeComponents(outputShape)); return success(); } - DenseIntElementsAttr paddings; + SmallVector paddingValues; // If the paddings value is not a constant, all dimensions must be dynamic. 
- if (!matchPattern(adaptor.getPadding(), m_Constant(&paddings))) { + if (!tosa::getConstShapeValue(adaptor.getPadding().getDefiningOp(), + paddingValues)) { outputShape.resize(inputShape.getRank(), ShapedType::kDynamic); inferredReturnShapes.push_back(ShapedTypeComponents(outputShape)); return success(); } - SmallVector paddingValues; - for (auto val : paddings) { - paddingValues.push_back(val.getSExtValue()); - } - outputShape.reserve(inputShape.getRank()); for (int i = 0, s = inputShape.getRank(); i < s; i++) { if (inputShape.isDynamicDim(i)) { outputShape.push_back(ShapedType::kDynamic); continue; } + auto padFront = paddingValues[i * 2]; + auto padBack = paddingValues[i * 2 + 1]; + if (padFront < 0 || padBack < 0) { + // if either padding for dim i is -1, output dim is unknown + outputShape.push_back(ShapedType::kDynamic); + continue; + } - outputShape.push_back(inputShape.getDimSize(i) + paddingValues[i * 2] + - paddingValues[i * 2 + 1]); + outputShape.push_back(inputShape.getDimSize(i) + padFront + padBack); } inferredReturnShapes.push_back(ShapedTypeComponents(outputShape)); @@ -876,17 +868,15 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents( LogicalResult tosa::PadOp::verify() { RankedTensorType inputType = getInput1().getType(); RankedTensorType outputType = getOutput().getType(); - RankedTensorType paddingType = getPadding().getType(); + auto paddingRank = cast(getPadding().getType()).getRank(); if (inputType.getRank() != outputType.getRank()) return emitOpError() << "expect same input and output tensor rank."; - if (!paddingType.isDynamicDim(0) && - paddingType.getDimSize(0) != inputType.getRank() * 2) + if (paddingRank != inputType.getRank() * 2) return emitOpError() << "expected padding tensor dim 0 to have size " << inputType.getRank() * 2 - << " (2*rank(shape1)) but got size " - << paddingType.getDimSize(0); + << " (2*rank(shape1)) but got size " << paddingRank; return success(); } diff --git 
a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp index 04a709c5967795..cb08360f902286 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp @@ -81,11 +81,7 @@ struct Conv2DIsFullyConnected : public OpRewritePattern { } } - auto padSizeTy = RankedTensorType::get({8}, rewriter.getI64Type()); - auto padSize = - DenseIntElementsAttr::get(padSizeTy, ArrayRef(pad)); - Value padSizeVal = - rewriter.create(op->getLoc(), padSizeTy, padSize); + Value padSizeVal = getTosaConstShape(rewriter, op->getLoc(), pad); auto padTy = RankedTensorType::get({}, inputETy); auto padAttr = DenseElementsAttr::get(padTy, zeroAttr); diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp index 14f392ab8c45c1..45f4419875b485 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp @@ -108,11 +108,7 @@ struct DepthwiseConv2DIsMul : public OpRewritePattern { } } - auto padSizeTy = RankedTensorType::get({10}, rewriter.getI64Type()); - auto padSize = - DenseIntElementsAttr::get(padSizeTy, ArrayRef(pad)); - Value padSizeVal = - rewriter.create(op->getLoc(), padSizeTy, padSize); + Value padSizeVal = getTosaConstShape(rewriter, op->getLoc(), pad); auto padTy = RankedTensorType::get({}, inputETy); auto padAttr = DenseElementsAttr::get(padTy, zeroAttr); diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index db1e219b601b30..1b97f0b245d9ba 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -135,15 +135,14 @@ class TransposeConvStridedConverter int64_t inputChannels = weightTy.getDimSize(3); // Pad the weight so 
that it is modulo of the striding. - llvm::SmallVector weightPadding = {0, 0, 0, 0, 0, 0, 0, 0}; + llvm::SmallVector weightPadding = {0, 0, 0, 0, 0, 0, 0, 0}; weightPadding[3] = (weightHeight % stride[0]) ? (stride[0] - weightHeight % stride[0]) : 0; weightPadding[5] = - (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0; - DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get( - RankedTensorType::get({8}, rewriter.getI32Type()), weightPadding); - Value weightPaddingVal = CreateOpAndInferShape( - rewriter, loc, weightPaddingAttr.getType(), weightPaddingAttr); + weightWidth % stride[1] ? stride[1] - weightWidth % stride[1] : 0; + + Value weightPaddingVal = + getTosaConstShape(rewriter, op->getLoc(), weightPadding); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); @@ -197,17 +196,14 @@ class TransposeConvStridedConverter /* axis = */ rewriter.getI32IntegerAttr(2)); // We need to pad the input far enough that we can pull all values. 
- llvm::SmallVector inputPadding = {0, 0, 0, 0, 0, 0, 0, 0}; + llvm::SmallVector inputPadding = {0, 0, 0, 0, 0, 0, 0, 0}; inputPadding[2] += restridedWeightTy.getDimSize(1) - 1; inputPadding[3] += restridedWeightTy.getDimSize(1) - 1; inputPadding[4] += restridedWeightTy.getDimSize(2) - 1; inputPadding[5] += restridedWeightTy.getDimSize(2) - 1; - DenseElementsAttr inputPaddingAttr = DenseIntElementsAttr::get( - RankedTensorType::get({8}, rewriter.getI32Type()), inputPadding); - - Value inputPaddingVal = CreateOpAndInferShape( - rewriter, loc, inputPaddingAttr.getType(), inputPaddingAttr); + Value inputPaddingVal = + getTosaConstShape(rewriter, op->getLoc(), inputPadding); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); @@ -310,17 +306,14 @@ class TransposeConvStridedConverter rewriter.getDenseI64ArrayAttr(sliceSize)) .getResult(); - llvm::SmallVector resultPadding = {0, 0, 0, 0, 0, 0, 0, 0}; + llvm::SmallVector resultPadding = {0, 0, 0, 0, 0, 0, 0, 0}; resultPadding[2] = resultPadTop; resultPadding[3] = resultTy.getDimSize(1) - resultPadTop - sliceSize[1]; resultPadding[4] = resultPadLeft; resultPadding[5] = resultTy.getDimSize(2) - resultPadLeft - sliceSize[2]; - DenseElementsAttr resultPaddingAttr = DenseIntElementsAttr::get( - RankedTensorType::get({8}, rewriter.getI32Type()), resultPadding); - - Value resultPaddingVal = CreateOpAndInferShape( - rewriter, loc, resultPaddingAttr.getType(), resultPaddingAttr); + Value resultPaddingVal = + getTosaConstShape(rewriter, op->getLoc(), resultPadding); Value resultPad = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), slice, diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp index 1f6e3b2ab83919..62b0bc1857e395 100644 --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -160,3 +160,36 @@ LogicalResult 
mlir::tosa::EqualizeRanks(ImplicitLocOpBuilder &builder, return success(); } + +Value mlir::tosa::getTosaConstShape(PatternRewriter &rewriter, Location loc, + llvm::ArrayRef shape) { + auto attr = rewriter.getIndexTensorAttr(shape); + auto type = mlir::tosa::shapeType::get(rewriter.getContext(), shape.size()); + mlir::Operation *mlir_op = + rewriter.create(loc, type, attr); + return mlir_op->getResult(0); +} + +SmallVector mlir::tosa::convertFromMlirShape(ArrayRef shape) { + return to_vector(llvm::map_range(shape, [](int64_t dim) { + return ShapedType::isDynamic(dim) ? -1 : dim; + })); +} + +bool mlir::tosa::getConstShapeValue(Operation *op, + llvm::SmallVector &result_shape) { + if (!op) { + return false; + } + if (auto constOp = mlir::dyn_cast(op)) { + Attribute constOpAttr = constOp->getAttr("value"); + DenseElementsAttr elementsAttr = cast(constOpAttr); + for (int i = 0; i < elementsAttr.size(); i++) { + int64_t val = elementsAttr.getValues()[i]; + result_shape.push_back(val); + } + return true; + } + // for undefined op, return false. 
+ return false; +} diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index 0b9a64494bc0f1..2f11b31aad2307 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -459,65 +459,84 @@ func.func @slice_dyn(%arg0: tensor) -> (tensor) { // CHECK-LABEL: @pad_float // CHECK-SAME: (%[[ARG0:[0-9a-zA-Z_]*]]: func.func @pad_float(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index + // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index + // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32 - // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] { + // CHECK: tensor.pad %[[ARG0]] low{{\[}}[[INDEX1]], [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: tensor.yield [[CST]] // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32> - %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xf32>, tensor<4xi32>) -> (tensor<4x9xf32>) + %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xf32>, !tosa.shape<4>) -> (tensor<4x9xf32>) return %1 : tensor<4x9xf32> } +// ----- func.func @pad_int(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: [[CST:%.+]] = arith.constant 0 : i32 // CHECK: tensor.pad // CHECK: tensor.yield [[CST]] - %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xi32>, tensor<4xi32>) -> (tensor<4x9xi32>) + %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xi32>, !tosa.shape<4>) -> (tensor<4x9xi32>) return %1 : tensor<4x9xi32> } +// ----- func.func 
@pad_quant(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: [[CST:%.+]] = arith.constant 42 : i32 // CHECK: tensor.pad // CHECK: tensor.yield [[CST]] - %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant} : (tensor<1x2xi32>, tensor<4xi32>) -> (tensor<4x9xi32>) + %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant} : (tensor<1x2xi32>, !tosa.shape<4>) -> (tensor<4x9xi32>) return %1 : tensor<4x9xi32> } // ----- func.func @pad_float_explicit(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index + // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index + // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 4.200000e+01 : f32 - // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] { + // CHECK: tensor.pad %[[ARG0]] low{{\[}}[[INDEX1]], [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: tensor.yield [[CST]] // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32> %1 = arith.constant dense<42.0> : tensor - %2 = "tosa.pad"(%arg0, %0, %1) : (tensor<1x2xf32>, tensor<4xi32>, tensor) -> (tensor<4x9xf32>) + %2 = "tosa.pad"(%arg0, %0, %1) : (tensor<1x2xf32>, !tosa.shape<4>, tensor) -> (tensor<4x9xf32>) return %2 : tensor<4x9xf32> } // ----- func.func @pad_dyn_input(%arg0 : tensor) -> (tensor) { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index + // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index + // 
CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index + // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32 - // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] { + // CHECK: tensor.pad %[[ARG0]] low{{\[}}[[INDEX1]], [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: tensor.yield [[CST]] // CHECK: } : tensor to tensor - %1 = "tosa.pad"(%arg0, %0) : (tensor, tensor<4xi32>) -> (tensor) + %1 = "tosa.pad"(%arg0, %0) : (tensor, !tosa.shape<4>) -> (tensor) return %1 : tensor } +// ----- func.func @pad_dyn_padding(%arg0 : tensor<1x2xf32>) -> (tensor) { - %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32> + %0 = tosa.const_shape {value = dense<[-1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK-DAG: [[INDEX1:%.+]] = arith.constant -1 : index + // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index + // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32 - // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] { + // CHECK: tensor.pad %[[ARG0]] low{{\[}}[[INDEX1]], [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: tensor.yield [[CST]] // CHECK: } : tensor<1x2xf32> to tensor - %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xf32>, tensor<4xi32>) -> (tensor) + %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xf32>, !tosa.shape<4>) -> (tensor) return %1 : tensor } diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index 889e2eda9e5b84..e394188e9a9311 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -210,8 +210,8 @@ func.func @max_pool2d_is_noop(%arg0: tensor<10x1x1x3xf32>) -> tensor<10x1x1x3xf3 // CHECK-LABEL: @pad_noop func.func @pad_noop(%arg0: tensor) -> tensor { // CHECK: return %arg0 - %0 = "tosa.const"() 
{ value = dense<0> : tensor<4xi32>} : () -> tensor<4xi32> - %1 = tosa.pad %arg0, %0 : (tensor, tensor<4xi32>) -> tensor + %0 = tosa.const_shape { value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = tosa.pad %arg0, %0 : (tensor, !tosa.shape<4>) -> tensor return %1 : tensor } @@ -221,8 +221,8 @@ func.func @pad_noop(%arg0: tensor) -> tensor { func.func @pad_noop_padding_mismatch_nofold(%arg0: tensor) -> tensor { // CHECK: %[[PAD:.+]] = tosa.pad // CHECK: return %[[PAD]] - %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<4xi32>} : () -> tensor<4xi32> - %1 = tosa.pad %arg0, %0 : (tensor, tensor<4xi32>) -> tensor + %shape = tosa.const_shape { value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = tosa.pad %arg0, %shape : (tensor, !tosa.shape<4>) -> tensor return %1 : tensor } @@ -232,41 +232,44 @@ func.func @pad_noop_padding_mismatch_nofold(%arg0: tensor) -> tensor) -> tensor { // CHECK: %[[PAD:.+]] = tosa.pad // CHECK: return %[[PAD]] - - %c0_i32 = arith.constant 0 : i32 - %shape = tensor.from_elements %c0_i32, %c0_i32 : tensor<2xi32> - - %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, tensor<2xi32>) -> tensor + %shape = tosa.const_shape { value = dense<[1, 2]> : tensor<2xindex>} : () -> !tosa.shape<2> + %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, !tosa.shape<2>) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: @pad_determine_val_i32 -func.func @pad_determine_val_i32(%arg0: tensor, %arg1 : tensor<4xi32>) -> tensor { - // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor} - // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]] - %1 = tosa.pad %arg0, %arg1 : (tensor, tensor<4xi32>) -> tensor +func.func @pad_determine_val_i32(%arg0: tensor, %arg1 : tensor<2x2xi32>) -> tensor { + // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor} + // CHECK-DAG: %[[PADDING:.+]] = tosa.const_shape {value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK: tosa.pad %arg0, 
%[[PADDING]], %[[ZERO]] + %0 = tosa.const_shape { value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = tosa.pad %arg0, %0 : (tensor, !tosa.shape<4>) -> tensor return %1 : tensor } // ----- // CHECK-LABEL: @pad_determine_val_f32 -func.func @pad_determine_val_f32(%arg0: tensor, %arg1 : tensor<4xi32>) -> tensor { - // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} - // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]] - %1 = tosa.pad %arg0, %arg1 : (tensor, tensor<4xi32>) -> tensor +func.func @pad_determine_val_f32(%arg0: tensor, %arg1 : tensor<2x2xi32>) -> tensor { + // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} + // CHECK-DAG: %[[PADDING:.+]] = tosa.const_shape {value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK: tosa.pad %arg0, %[[PADDING]], %[[ZERO]] + %0 = tosa.const_shape { value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = tosa.pad %arg0, %0 : (tensor, !tosa.shape<4>) -> tensor return %1 : tensor } // ----- // CHECK-LABEL: @pad_determine_val_quant -func.func @pad_determine_val_quant(%arg0: tensor, %arg1 : tensor<4xi32>) -> tensor { - // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<42> : tensor} - // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]] - %1 = tosa.pad %arg0, %arg1 {quantization_info = #tosa.pad_quant} : (tensor, tensor<4xi32>) -> tensor +func.func @pad_determine_val_quant(%arg0: tensor, %arg1 : tensor<2x2xi32>) -> tensor { + // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor} + // CHECK-DAG: %[[PADDING:.+]] = tosa.const_shape {value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK: tosa.pad %arg0, %[[PADDING]], %[[ZERO]] + %0 = tosa.const_shape { value = dense<[1, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = tosa.pad %arg0, %0 {input_zp = 42 : i32} : (tensor, !tosa.shape<4>) -> tensor return %1 : tensor } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir 
b/mlir/test/Dialect/Tosa/invalid.mlir index deaa8e24423374..4808867b28bb97 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -165,52 +165,56 @@ func.func @test_concat_element_type_mismatch(%arg0 : tensor<1x2xf32>, %arg1 : te // ----- -func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> { - // expected-error@+1 {{'tosa.pad' op padding of pad is not constant}} - %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32> +func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: !tosa.shape<6>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.pad' op shape operand is not compile time resolvable}} + %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, !tosa.shape<6>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- func.func @test_pad_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor) -> tensor<13x21x3xi8> { - %0 = "tosa.const"() {value = dense<[0, 0, 0, 1, 0, 1]> : tensor<6xi32>} : () -> tensor<6xi32> + %0 = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 1]> : tensor<6xindex>} : () -> !tosa.shape<6> // expected-error@+1 {{'tosa.pad' op pad_const of pad is not constant}} - %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, tensor<6xi32>, tensor) -> tensor<13x21x3xi8> + %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, !tosa.shape<6>, tensor) -> tensor<13x21x3xi8> return %1 : tensor<13x21x3xi8> } // ----- -func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) { +func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>) { + %padding = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> // expected-error@+1 {{'tosa.pad' op expect same input and output tensor rank.}} - %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<4xi32>) -> tensor<13x21x3xf32> + %1 = tosa.pad %arg0, %padding : (tensor<13x21xf32>, !tosa.shape<4>) -> tensor<13x21x3xf32> return 
} // ----- -func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { - // expected-error@+1 {{'tosa.pad' op operand #1 must be 1D tensor of 32-bit signless integer or 64-bit signless integer values, but got 'tensor<2x2xi32>'}} - %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21xf32> +func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>) { + %0 = tosa.const_shape {value = dense<1> : tensor<6xindex>} : () -> !tosa.shape<6> + // expected-error@+1 {{'tosa.pad' op expected padding tensor dim 0 to have size 4 (2*rank(shape1)) but got size 6}} + %1 = tosa.pad %arg0, %0 : (tensor<13x21xf32>, !tosa.shape<6>) -> tensor<13x21xf32> return } // ----- -func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) { - %0 = "tosa.const"() {value = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> - // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<1xf32>'}} - %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<4xi32>, tensor<1xf32>) -> tensor<13x21xf32> +func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { + %0 = tosa.const_shape {value = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> + %1 = "tosa.const"() {value = dense<3.14> : tensor<2xf32>} : () -> tensor<2xf32> + // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<2xf32>'}} + %2 = tosa.pad %arg0, %0, %1 : (tensor<13x21xf32>, !tosa.shape<4>, tensor<2xf32>) -> tensor<13x21xf32> return } // ----- -func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<4xi32>) -> tensor<13x21x3xf32> { +func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.const_shape {value = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> // expected-error@+1 {{'tosa.pad' op expected padding tensor dim 0 to have size 6 
(2*rank(shape1)) but got size 4}} - %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<4xi32>) -> tensor<13x21x3xf32> - return %0 : tensor<13x21x3xf32> + %1 = tosa.pad %arg0, %0 : (tensor<13x21x3xf32>, !tosa.shape<4>) -> tensor<13x21x3xf32> + return %1 : tensor<13x21x3xf32> } // ----- diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 690e208af1e5f9..563c5fa457d351 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -525,16 +525,18 @@ func.func @test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) - // ----- // CHECK-LABEL: pad -func.func @test_pad(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> { - %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32> +func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %padding = tosa.const_shape {value = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + %0 = tosa.pad %arg0, %padding : (tensor<13x21x3xf32>, !tosa.shape<6>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: pad_explicit_value -func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> { +func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tosa.const"() {value = dense<3.14> : tensor} : () -> tensor - %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21x3xf32>, tensor<6xi32>, tensor) -> tensor<13x21x3xf32> + %padding = tosa.const_shape {value = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + %1 = tosa.pad %arg0, %padding, %0 : (tensor<13x21x3xf32>, !tosa.shape<6>, tensor) -> tensor<13x21x3xf32> return %1 : tensor<13x21x3xf32> } diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir index 8df4630f9c17ff..95d9bb1b98ab74 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir +++ 
b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir @@ -58,9 +58,9 @@ func.func @conv_with_dynamic_dim(%arg0: tensor, %arg1: tensor<384 // CHECK-LABEL: @conv2d_as_fully_connected_padded func.func @conv2d_as_fully_connected_padded(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<3x1x1x2xi8>, %arg2: tensor<3xi32>) -> tensor<4x12x12x3xi32> { - // CHECK-DAG: %[[PAD_SHAPE:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi64>} + // CHECK-DAG: %[[PAD_SHAPE:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[PAD_VAL:.+]] = "tosa.const"() <{value = dense<42> : tensor} - // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, tensor<8xi64>, tensor) -> tensor<4x12x12x2xi8> + // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, !tosa.shape<8>, tensor) -> tensor<4x12x12x2xi8> // CHECK-DAG: %[[RESHAPE_INPUT:.+]] = tosa.reshape %[[PAD]] {new_shape = array} // CHECK-DAG: %[[RESHAPE_FILTER:.+]] = tosa.reshape %arg1 {new_shape = array} // CHECK-DAG: %[[FULLY:.+]] = tosa.fully_connected %[[RESHAPE_INPUT]], %[[RESHAPE_FILTER]], %arg2 {quantization_info = #tosa.conv_quant} diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir index cfff6396ad486d..bbcc206e1490c7 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir @@ -46,10 +46,10 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor< // CHECK-LABEL: @depthwise_conv2d_as_mul_padded func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> { - // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xi64>} + // CHECK-DAG: %[[pad:.+]] = tosa.const_shape 
{value = dense<[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xindex>} : () -> !tosa.shape<10> // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} // CHECK: %[[reIn:.+]] = tosa.reshape %arg0 {new_shape = array} - // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<10xi64>, tensor) -> tensor<4x12x12x2x1xf32> + // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, !tosa.shape<10>, tensor) -> tensor<4x12x12x2x1xf32> // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1 {new_shape = array} // CHECK: %[[mul:.+]] = tosa.mul %3, %[[reArg1]] {shift = 0 : i8} // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]] {new_shape = array} diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir index c361c7c2899fc3..96f71c349938b9 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir @@ -45,7 +45,7 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1: // CHECK-LABEL: @transpose_conv2d_strided func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<5x3x5x3xf32>, %arg2: tensor<5xf32>) -> tensor<2x?x?x5xf32> { // Manipulate the weight matrix to handle striding. 
- // CHECK-DAG: %[[PADV:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[PADV:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[TRANSV:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>} // CHECK-DAG: %[[PADW:.+]] = tosa.pad %arg1, %[[PADV]] // CHECK-DAG: %[[RESW1:.+]] = tosa.reshape %[[PADW]] {new_shape = array} @@ -55,7 +55,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor< // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32} // Pad out the input matrix to handle the transpose conv. - // CHECK-DAG: %[[PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[TRANS2:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]] @@ -78,7 +78,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor< // CHECK-LABEL: @transpose_conv2d_strided_quantized func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1: tensor<5x3x5x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x35x47x5xi32>) { // Manipulate the weight matrix to handle striding. 
- // CHECK-DAG: %[[PADV:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[PADV:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[TRANSV:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>} // CHECK-DAG: %[[PADW:.+]] = tosa.pad %arg1, %[[PADV]] {quantization_info = #tosa.pad_quant} // CHECK-DAG: %[[RESW1:.+]] = tosa.reshape %[[PADW]] {new_shape = array} @@ -88,7 +88,7 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1 // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32} // Pad out the input matrix to handle the transpose conv. - // CHECK-DAG: %[[PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[TRANS2:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]] {quantization_info = #tosa.pad_quant} @@ -109,12 +109,12 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1 // CHECK-LABEL: @transpose_conv2d_strided_overpad func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 : tensor<1x2x1x1xi8>, %arg2 : tensor<1xi32>) -> (tensor<1x19x2x1xi32>) { - // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 0, 0, 1, 0, 0]> : tensor<8xi32> + // CHECK-DAG: %[[WEIGHT_PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 0, 0, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>} - // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 0, 0, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[INPUT_PAD:.+]] = 
tosa.const_shape {value = dense<[0, 0, 1, 1, 0, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>} // CHECK-DAG: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} - // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 2, 0, 0, 0, 0, 0]> : tensor<8xi32>} + // CHECK-DAG: %[[RESULT_PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 2, 0, 0, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> // CHECK: %[[PAD_WEIGHT:.+]] = tosa.pad %arg1, %[[WEIGHT_PAD]] {quantization_info = #tosa.pad_quant} // CHECK: %[[RESHAPE_WEIGHT_0:.+]] = tosa.reshape %[[PAD_WEIGHT]] {new_shape = array} // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]], %[[WEIGHT_PERMS]] diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index 44cc6acd7e97a0..6beb1ad6296135 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -492,22 +492,14 @@ func.func @test_concat_axis_1(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>) return } -// ----- - -// CHECK-LABEL: @test_padding_no_const -func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<4xi32>) -> () { - // CHECK: tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<4xi32>) -> tensor - %0 = tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<4xi32>) -> tensor - return -} // ----- // CHECK-LABEL:@test_padding_dynamic_input func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - // CHECK: tosa.pad %arg0, %cst : (tensor<1x?xf32>, tensor<4xi32>) -> tensor<4x?xf32> - %1 = tosa.pad %arg0, %0 : (tensor<1x?xf32>, tensor<4xi32>) -> tensor + %0 = tosa.const_shape { value = dense<[1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + // CHECK: tosa.pad %arg0, %0 : (tensor<1x?xf32>, !tosa.shape<4>) -> 
tensor<4x?xf32> + %1 = tosa.pad %arg0, %0 : (tensor<1x?xf32>, !tosa.shape<4>) -> tensor return } @@ -515,9 +507,9 @@ func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () { // CHECK-LABEL: @test_padding_simple func.func @test_padding_simple(%arg0 : tensor<1x2xf32>) -> () { - %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - // CHECK: tosa.pad %arg0, %cst : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<4x9xf32> - %1 = tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>) -> tensor + %0 = tosa.const_shape { value = dense<[1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + // CHECK: tosa.pad %arg0, %0 : (tensor<1x2xf32>, !tosa.shape<4>) -> tensor<4x9xf32> + %1 = tosa.pad %arg0, %0 : (tensor<1x2xf32>, !tosa.shape<4>) -> tensor return } From 719f0d92538c917306004e541f38c79717d0c07d Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 22 Jan 2025 12:39:35 -0800 Subject: [PATCH 035/208] [HLSL] Fix global resource initialization (#123394) Create separate resource initialization function for each resource and add them to CodeGenModule's `CXXGlobalInits` list. Fixes #120636 and addresses this [comment ](https://github.com/llvm/llvm-project/pull/119755/files#r1894093603). 
--- clang/lib/CodeGen/CGDeclCXX.cpp | 8 -- clang/lib/CodeGen/CGHLSLRuntime.cpp | 130 +++++++++--------- clang/lib/CodeGen/CGHLSLRuntime.h | 5 - clang/lib/CodeGen/CodeGenModule.h | 2 + .../ByteAddressBuffers-constructors.hlsl | 25 ++-- .../builtins/RWBuffer-constructor-opt.hlsl | 8 +- .../builtins/RWBuffer-constructor.hlsl | 15 +- .../StructuredBuffers-constructors.hlsl | 51 ++++--- clang/test/CodeGenHLSL/resource-bindings.hlsl | 17 ++- 9 files changed, 128 insertions(+), 133 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 96517511b21114..1c2fecea1a6ac2 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -1131,14 +1131,6 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn, if (Decls[i]) EmitRuntimeCall(Decls[i]); - if (getLangOpts().HLSL) { - CGHLSLRuntime &CGHLSL = CGM.getHLSLRuntime(); - if (CGHLSL.needsResourceBindingInitFn()) { - llvm::Function *ResInitFn = CGHLSL.createResourceBindingInitFn(); - Builder.CreateCall(llvm::FunctionCallee(ResInitFn), {}); - } - } - Scope.ForceCleanup(); if (ExitBlock) { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 5679bd71581795..345e218f424514 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -536,89 +536,85 @@ void CGHLSLRuntime::generateGlobalCtorDtorCalls() { } } -void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD, - llvm::GlobalVariable *GV) { - // If the global variable has resource binding, add it to the list of globals - // that need resource binding initialization. - const HLSLResourceBindingAttr *RBA = VD->getAttr(); - if (!RBA) - return; - - if (!HLSLAttributedResourceType::findHandleTypeOnResource( - VD->getType().getTypePtr())) - // FIXME: Only simple declarations of resources are supported for now. - // Arrays of resources or resources in user defined classes are - // not implemented yet. 
- return; - - ResourcesToBind.emplace_back(VD, GV); -} - -bool CGHLSLRuntime::needsResourceBindingInitFn() { - return !ResourcesToBind.empty(); +// Returns true if the type is an HLSL resource class +static bool isResourceRecordType(const clang::Type *Ty) { + return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr; } -llvm::Function *CGHLSLRuntime::createResourceBindingInitFn() { - // No resources to bind - assert(needsResourceBindingInitFn() && "no resources to bind"); - +static void createResourceInitFn(CodeGenModule &CGM, const VarDecl *VD, + llvm::GlobalVariable *GV, unsigned Slot, + unsigned Space) { LLVMContext &Ctx = CGM.getLLVMContext(); llvm::Type *Int1Ty = llvm::Type::getInt1Ty(Ctx); - llvm::Function *InitResBindingsFunc = - llvm::Function::Create(llvm::FunctionType::get(CGM.VoidTy, false), - llvm::GlobalValue::InternalLinkage, - "_init_resource_bindings", CGM.getModule()); + llvm::Function *InitResFunc = llvm::Function::Create( + llvm::FunctionType::get(CGM.VoidTy, false), + llvm::GlobalValue::InternalLinkage, + ("_init_resource_" + VD->getName()).str(), CGM.getModule()); + InitResFunc->addFnAttr(llvm::Attribute::AlwaysInline); llvm::BasicBlock *EntryBB = - llvm::BasicBlock::Create(Ctx, "entry", InitResBindingsFunc); + llvm::BasicBlock::Create(Ctx, "entry", InitResFunc); CGBuilderTy Builder(CGM, Ctx); const DataLayout &DL = CGM.getModule().getDataLayout(); Builder.SetInsertPoint(EntryBB); - for (const auto &[VD, GV] : ResourcesToBind) { - for (Attr *A : VD->getAttrs()) { - HLSLResourceBindingAttr *RBA = dyn_cast(A); - if (!RBA) - continue; - - const HLSLAttributedResourceType *AttrResType = - HLSLAttributedResourceType::findHandleTypeOnResource( - VD->getType().getTypePtr()); - - // FIXME: Only simple declarations of resources are supported for now. - // Arrays of resources or resources in user defined classes are - // not implemented yet. 
- assert(AttrResType != nullptr && - "Resource class must have a handle of HLSLAttributedResourceType"); - - llvm::Type *TargetTy = - CGM.getTargetCodeGenInfo().getHLSLType(CGM, AttrResType); - assert(TargetTy != nullptr && - "Failed to convert resource handle to target type"); - - auto *Space = llvm::ConstantInt::get(CGM.IntTy, RBA->getSpaceNumber()); - auto *Slot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber()); + const HLSLAttributedResourceType *AttrResType = + HLSLAttributedResourceType::findHandleTypeOnResource( + VD->getType().getTypePtr()); + + // FIXME: Only simple declarations of resources are supported for now. + // Arrays of resources or resources in user defined classes are + // not implemented yet. + assert(AttrResType != nullptr && + "Resource class must have a handle of HLSLAttributedResourceType"); + + llvm::Type *TargetTy = + CGM.getTargetCodeGenInfo().getHLSLType(CGM, AttrResType); + assert(TargetTy != nullptr && + "Failed to convert resource handle to target type"); + + llvm::Value *Args[] = { + llvm::ConstantInt::get(CGM.IntTy, Space), /* reg_space */ + llvm::ConstantInt::get(CGM.IntTy, Slot), /* lower_bound */ // FIXME: resource arrays are not yet implemented - auto *Range = llvm::ConstantInt::get(CGM.IntTy, 1); - auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); + llvm::ConstantInt::get(CGM.IntTy, 1), /* range_size */ + llvm::ConstantInt::get(CGM.IntTy, 0), /* index */ // FIXME: NonUniformResourceIndex bit is not yet implemented - auto *NonUniform = llvm::ConstantInt::get(Int1Ty, false); - llvm::Value *Args[] = {Space, Slot, Range, Index, NonUniform}; + llvm::ConstantInt::get(Int1Ty, false) /* non-uniform */ + }; + llvm::Value *CreateHandle = Builder.CreateIntrinsic( + /*ReturnType=*/TargetTy, + CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(), Args, nullptr, + Twine(VD->getName()).concat("_h")); + + llvm::Value *HandleRef = Builder.CreateStructGEP(GV->getValueType(), GV, 0); + Builder.CreateAlignedStore(CreateHandle, 
HandleRef, + HandleRef->getPointerAlignment(DL)); + Builder.CreateRetVoid(); - llvm::Value *CreateHandle = Builder.CreateIntrinsic( - /*ReturnType=*/TargetTy, getCreateHandleFromBindingIntrinsic(), Args, - nullptr, Twine(VD->getName()).concat("_h")); + CGM.AddCXXGlobalInit(InitResFunc); +} - llvm::Value *HandleRef = - Builder.CreateStructGEP(GV->getValueType(), GV, 0); - Builder.CreateAlignedStore(CreateHandle, HandleRef, - HandleRef->getPointerAlignment(DL)); - } - } +void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD, + llvm::GlobalVariable *GV) { - Builder.CreateRetVoid(); - return InitResBindingsFunc; + // If the global variable has resource binding, create an init function + // for the resource + const HLSLResourceBindingAttr *RBA = VD->getAttr(); + if (!RBA) + // FIXME: collect unbound resources for implicit binding resolution later + // on? + return; + + if (!isResourceRecordType(VD->getType().getTypePtr())) + // FIXME: Only simple declarations of resources are supported for now. + // Arrays of resources or resources in user defined classes are + // not implemented yet. 
+ return; + + createResourceInitFn(CGM, VD, GV, RBA->getSlotNumber(), + RBA->getSpaceNumber()); } llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index f9dc7b87af0e34..032b2dee82f211 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -159,8 +159,6 @@ class CGHLSLRuntime { void setHLSLFunctionAttributes(const FunctionDecl *FD, llvm::Function *Fn); void handleGlobalVarDefinition(const VarDecl *VD, llvm::GlobalVariable *Var); - bool needsResourceBindingInitFn(); - llvm::Function *createResourceBindingInitFn(); llvm::Instruction *getConvergenceToken(llvm::BasicBlock &BB); private: @@ -173,9 +171,6 @@ class CGHLSLRuntime { void addBufferDecls(const DeclContext *DC, Buffer &CB); llvm::Triple::ArchType getArch(); llvm::SmallVector Buffers; - - llvm::SmallVector> - ResourcesToBind; }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index d5ef1a710eb403..1aa5d483d49c08 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1226,6 +1226,8 @@ class CodeGenModule : public CodeGenTypeCache { llvm::Function *getIntrinsic(unsigned IID, ArrayRef Tys = {}); + void AddCXXGlobalInit(llvm::Function *F) { CXXGlobalInits.push_back(F); } + /// Emit code for a single top level declaration. 
void EmitTopLevelDecl(Decl *D); diff --git a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl index 7507e741a9c9ba..7fc6f4bb05745b 100644 --- a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl @@ -15,15 +15,20 @@ RasterizerOrderedByteAddressBuffer Buffer2: register(u3, space4); // CHECK: @Buffer1 = global %"class.hlsl::RWByteAddressBuffer" zeroinitializer, align 4 // CHECK: @Buffer2 = global %"class.hlsl::RasterizerOrderedByteAddressBuffer" zeroinitializer, align 4 +// CHECK; define internal void @_init_resource_Buffer0() +// CHECK-DXIL: %Buffer0_h = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", i8, 0, 0) %Buffer0_h, ptr @Buffer0, align 4 + +// CHECK; define internal void @_init_resource_Buffer1() +// CHECK-DXIL: %Buffer1_h = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_1_0t(i32 2, i32 1, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", i8, 1, 0) %Buffer1_h, ptr @Buffer1, align 4 + +// CHECK; define internal void @_init_resource_Buffer2() +// CHECK-DXIL: %Buffer2_h = call target("dx.RawBuffer", i8, 1, 1) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_1_1t(i32 4, i32 3, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", i8, 1, 1) %Buffer2_h, ptr @Buffer2, align 4 + // CHECK: define internal void @_GLOBAL__sub_I_ByteAddressBuffers_constructors.hlsl() // CHECK: entry: -// CHECK: call void @_init_resource_bindings() - -// CHECK: define internal void @_init_resource_bindings() { -// CHECK-NEXT: entry: -// CHECK-DXIL-NEXT: %Buffer0_h = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false) 
-// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 0, 0) %Buffer0_h, ptr @Buffer0, align 4 -// CHECK-DXIL-NEXT: %Buffer1_h = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_1_0t(i32 2, i32 1, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 1, 0) %Buffer1_h, ptr @Buffer1, align 4 -// CHECK-DXIL-NEXT: %Buffer2_h = call target("dx.RawBuffer", i8, 1, 1) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_1_1t(i32 4, i32 3, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 1, 1) %Buffer2_h, ptr @Buffer2, align 4 +// CHECK: call void @_init_resource_Buffer0() +// CHECK: call void @_init_resource_Buffer1() +// CHECK: call void @_init_resource_Buffer2() diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor-opt.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor-opt.hlsl index 237b97394024c6..03f22620a097d9 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor-opt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor-opt.hlsl @@ -11,11 +11,11 @@ void main() { // CHECK: define void @main() // CHECK-NEXT: entry: -// CHECK-SPIRV-NEXT: %Buf_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("spirv.Image", float, 5, 2, 0, 0, 2, 0) %Buf_h.i, ptr @Buf, align 8 +// CHECK-SPIRV-NEXT: %[[HANDLE:.*]] = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("spirv.Image", float, 5, 2, 0, 0, 2, 0) %[[HANDLE:.*]], ptr @Buf, align 8 -// CHECK-DXIL-NEXT: %Buf_h.i = tail call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store 
target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h.i, ptr @Buf, align 4 +// CHECK-DXIL-NEXT: %[[HANDLE:.*]] = tail call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %[[HANDLE]], ptr @Buf, align 4 // CHECK-NEXT: ret void } diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl index e4226abf71b8ec..d7cc3892a404bb 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl @@ -9,17 +9,12 @@ RWBuffer Buf : register(u5, space3); // CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", float, 1, 0, 0) } // CHECK: @Buf = global %"class.hlsl::RWBuffer" zeroinitializer, align 4 +// CHECK: define internal void @_init_resource_Buf() +// CHECK-DXIL: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 + // CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK-NEXT: entry: // CHECK: define internal void @_GLOBAL__sub_I_RWBuffer_constructor.hlsl() -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__cxx_global_var_init() -// CHECK-NEXT: call void @_init_resource_bindings() - -// CHECK: define internal void @_init_resource_bindings() { -// CHECK-NEXT: entry: -// CHECK-DXIL-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 -// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) 
@llvm.spv.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 +// CHECK: call void @_init_resource_Buf() diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl index 16f4f80231dae4..bd931181045ba5 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl @@ -21,6 +21,26 @@ RasterizerOrderedStructuredBuffer Buf5 : register(u1, space2); // CHECK: @Buf4 = global %"class.hlsl::ConsumeStructuredBuffer" zeroinitializer, align 4 // CHECK: @Buf5 = global %"class.hlsl::RasterizerOrderedStructuredBuffer" zeroinitializer, align 4 +// CHECK: define internal void @_init_resource_Buf() +// CHECK-DXIL: %Buf_h = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(i32 0, i32 10, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", float, 0, 0) %Buf_h, ptr @Buf, align 4 + +// CHECK: define internal void @_init_resource_Buf2() +// CHECK-DXIL: %Buf2_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) %Buf2_h, ptr @Buf2, align 4 + +// CHECK: define internal void @_init_resource_Buf3() +// CHECK-DXIL: %Buf3_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) %Buf3_h, ptr @Buf3, align 4 + +// CHECK: define internal void @_init_resource_Buf4() +// CHECK-DXIL: %Buf4_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) +// 
CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) %Buf4_h, ptr @Buf4, align 4 + +// CHECK: define internal void @_init_resource_Buf5() +// CHECK-DXIL: %Buf5_h = call target("dx.RawBuffer", float, 1, 1) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) +// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 1) %Buf5_h, ptr @Buf5, align 4 + // CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK-NEXT: entry: // CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) @@ -32,29 +52,8 @@ RasterizerOrderedStructuredBuffer Buf5 : register(u1, space2); // CHECK-NEXT: entry: // CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl() -// CHECK: entry: -// CHECK: call void @_init_resource_bindings() - -// CHECK: define internal void @_init_resource_bindings() { -// CHECK-NEXT: entry: -// CHECK-DXIL-NEXT: %Buf_h = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(i32 0, i32 10, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf_h, ptr @Buf, align 4 -// CHECK-DXIL-NEXT: %Buf2_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf2_h, ptr @Buf2, align 4 -// CHECK-DXIL-NEXT: %Buf3_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf3_h, ptr @Buf3, align 4 -// CHECK-DXIL-NEXT: %Buf4_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store 
target("dx.RawBuffer", float, 1, 0) %Buf4_h, ptr @Buf4, align 4 -// CHECK-DXIL-NEXT: %Buf5_h = call target("dx.RawBuffer", float, 1, 1) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 1) %Buf5_h, ptr @Buf5, align 4 - -// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.RawBuffer", float, 0, 0) @llvm.spv.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(i32 0, i32 10, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf_h, ptr @Buf", align 4 -// CHECK-SPIRV-NEXT: %Buf2_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf2_h, ptr @Buf2", align 4 -// CHECK-SPIRV-NEXT: %Buf3_h = call target("dx.RawBuffer", float, 0, 0) @llvm.spv.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf3_h, ptr @Buf3, align 4 -// CHECK-SPIRV-NEXT: %Buf4_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf4_h, ptr @Buf4, align 4 -// CHECK-SPIRV-NEXT: %Buf5_h = call target("dx.RawBuffer", float, 1, 1) @llvm.spv.resource.handlefrombinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) -// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 1) %Buf5_h, ptr @Buf5, align 4 +// CHECK: call void @_init_resource_Buf() +// CHECK: call void @_init_resource_Buf2() +// CHECK: call void @_init_resource_Buf3() +// CHECK: call void @_init_resource_Buf4() +// CHECK: call void @_init_resource_Buf5() diff --git a/clang/test/CodeGenHLSL/resource-bindings.hlsl b/clang/test/CodeGenHLSL/resource-bindings.hlsl index 
4049a87a8ab712..bfa7896bd98114 100644 --- a/clang/test/CodeGenHLSL/resource-bindings.hlsl +++ b/clang/test/CodeGenHLSL/resource-bindings.hlsl @@ -1,13 +1,14 @@ -// RUN: %clang_cc1 -triple dxil--shadermodel6.6-compute -x hlsl -finclude-default-header -emit-llvm -o - %s | FileCheck %s - -// CHECK: define internal void @_init_resource_bindings() { +// RUN: %clang_cc1 -triple dxil--shadermodel6.6-compute -x hlsl -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// CHECK: define internal void @_init_resource_U0S0() // CHECK: %U0S0_h = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false) RWBuffer U0S0 : register(u0); +// CHECK: define internal void @_init_resource_U5S3() // CHECK: %U5S3_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) RWBuffer U5S3 : register(u5, space3); +// CHECK: define internal void @_init_resource_T2S2() // CHECK: %T2S2_h = call target("dx.RawBuffer", i32, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i32_0_0t(i32 2, i32 2, i32 1, i32 0, i1 false) StructuredBuffer T2S2 : register(t2, space2); struct S { @@ -15,5 +16,15 @@ struct S { int i; }; +// CHECK: define internal void @_init_resource_T3S0() // CHECK: %T3S0_h = call target("dx.RawBuffer", %struct.S, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_s_struct.Ss_0_0t(i32 0, i32 3, i32 1, i32 0, i1 false) StructuredBuffer T3S0 : register(t3); + +// CHECK: define void @main() +// CHECK: call void @_init_resource_U0S0() +// CHECK: call void @_init_resource_U5S3() +// CHECK: call void @_init_resource_T2S2() +// CHECK: call void @_init_resource_T3S0() + +[numthreads(4,1,1)] +void main() {} From 1cf0af3d321e3aca57e348b9c0675d153c7b6968 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 22 Jan 2025 15:56:58 -0500 Subject: [PATCH 036/208] 
[AMDGPU][True16][MC] true16 for v_cmpx_class_f16 (#123251) True16 format for v_cmpx_class_f16. Update VOPCX_CLASS t16 and fake16 pseudo. --- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 66 +++++++---- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s | 65 ++++++----- .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s | 21 +++- .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s | 29 +++-- llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s | 75 ++++++++----- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s | 65 ++++++----- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s | 21 +++- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 42 +++++-- .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s | 42 +++++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s | 25 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 73 ++++++------ llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 29 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s | 72 +++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s | 62 +++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s | 18 ++- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 42 +++++-- .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s | 42 +++++-- .../gfx11_dasm_vop3_dpp16_from_vopcx.txt | 54 ++++++--- .../gfx11_dasm_vop3_dpp8_from_vopcx.txt | 18 ++- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 27 ++++- .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt | 65 ++++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt | 54 ++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt | 18 ++- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 24 +++- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 106 +++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 21 +++- .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt | 61 +++++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt | 50 ++++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt | 14 ++- 29 files changed, 907 insertions(+), 394 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 91ad2cafe9b54b..0f80271686d4a6 100644 --- 
a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -957,41 +957,69 @@ multiclass VOPC_Class_Profile_t16 sched> { } class VOPC_Class_NoSdst_Profile sched, ValueType src0VT, ValueType src1VT = i32> : - VOPC_Class_Profile { + VOPC_Class_Profile_Base { let Outs64 = (outs ); let OutsSDWA = (outs ); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); - let AsmVOP3Base = "$src0_modifiers, $src1"; + let HasDst = 0; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } multiclass VOPC_Class_NoSdst_Profile_t16 sched> { def NAME : VOPC_Class_NoSdst_Profile; - def _t16 : VOPC_Class_NoSdst_Profile { + def _t16 : VOPC_Class_NoSdst_Profile { let IsTrue16 = 1; let IsRealTrue16 = 1; - let Src1RC32 = getVregSrcForVT.ret; - let Src1RC64 = VSrc_b32; - let Src0DPP = getVregSrcForVT.ret; - let Src1DPP = getVregSrcForVT.ret; - let Src2DPP = getVregSrcForVT.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let HasOpSel = 1; + let HasModifiers = 1; // All instructions at least have OpSel + let Src0RC32 = getVOPSrc0ForVT.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; + + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let Src2Mod = getSrcMod.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = 
getSrcModVOP3DPP.ret; } - def _fake16 : VOPC_Class_NoSdst_Profile { + def _fake16 : VOPC_Class_NoSdst_Profile { let IsTrue16 = 1; + let Src0RC32 = getVOPSrc0ForVT.ret; let Src1RC32 = getVregSrcForVT.ret; - let Src1RC64 = VSrc_b32; let Src0DPP = getVregSrcForVT.ret; let Src1DPP = getVregSrcForVT.ret; let Src2DPP = getVregSrcForVT.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_32; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; + + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let Src2Mod = getSrcMod.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; } } @@ -1141,10 +1169,10 @@ multiclass VOPCX_CLASS_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOPCX_Class_Pseudos ; } - let OtherPredicates = [UseRealTrue16Insts] in { + let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPCX_Class_Pseudos ; } - let OtherPredicates = [UseFakeTrue16Insts] in { + let True16Predicate = UseFakeTrue16Insts in { defm _fake16 : VOPCX_Class_Pseudos ; } } @@ -2044,7 +2072,7 @@ defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>; defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>; defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; -defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; +defm V_CMPX_CLASS_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; defm V_CMPX_CLASS_F64 : 
VOPCX_Real_gfx11_gfx12<0x0ff>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index 60ec94446235ed..379142e84aabdb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -3,47 +3,56 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_class_f16_e64_dpp 
v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, 
v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] + +v_cmpx_class_f16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_class_f16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index fb2b28874bd04f..4d6928ecbbc767 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ 
-2,14 +2,23 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cmpx_class_f16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_class_f16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x11,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 7a95d8cd53cde4..0d8dc8b1bbc8b4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -2,17 +2,17 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_e64 v1, v2 -// GFX11: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_class_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_class_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_class_f16_e64 v255, v2 -// GFX11: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_class_f16_e64 v255.l, v2.l +// GFX11: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_class_f16_e64 s1, v2 -// GFX11: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +v_cmpx_class_f16_e64 s1, v2.l +// GFX11: v_cmpx_class_f16_e64 s1, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] -v_cmpx_class_f16_e64 s105, v255 -// GFX11: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +v_cmpx_class_f16_e64 s105, v255.l +// GFX11: v_cmpx_class_f16_e64 s105, v255.l ; encoding: 
[0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] v_cmpx_class_f16_e64 vcc_lo, s2 // GFX11: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00] @@ -47,8 +47,17 @@ v_cmpx_class_f16_e64 src_scc, vcc_lo v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi // GFX11: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] -v_cmpx_class_f16_e64 v1, 0.5 -// GFX11: v_cmpx_class_f16_e64 v1, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] +v_cmpx_class_f16_e64 v1.l, 0.5 +// GFX11: v_cmpx_class_f16_e64 v1.l, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] + +v_cmpx_class_f16_e64 v1.h, v2.h +// GFX11: v_cmpx_class_f16_e64 v1.h, v2.h ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_class_f16_e64 v255.h, v2.l +// GFX11: v_cmpx_class_f16_e64 v255.h, v2.l ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16_e64 s105, v255.h +// GFX11: v_cmpx_class_f16_e64 s105, v255.h ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] v_cmpx_class_f32_e64 v1, v2 // GFX11: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index 42d7c5ea600b41..d3eff378e630f2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -2,50 +2,65 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_e32 v1, v2 -// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] +v_cmpx_class_f16 v1.l, v2.l +// GFX11: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] -v_cmpx_class_f16 v127, v2 -// GFX11: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] 
+v_cmpx_class_f16 v127.l, v2.l +// GFX11: v_cmpx_class_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0xfa,0x7d] -v_cmpx_class_f16 s1, v2 -// GFX11: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] +v_cmpx_class_f16 s1, v2.l +// GFX11: v_cmpx_class_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0xfa,0x7d] -v_cmpx_class_f16 s105, v2 -// GFX11: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] +v_cmpx_class_f16 s105, v2.l +// GFX11: v_cmpx_class_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0xfa,0x7d] -v_cmpx_class_f16 vcc_lo, v2 -// GFX11: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] +v_cmpx_class_f16 vcc_lo, v2.l +// GFX11: v_cmpx_class_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0xfa,0x7d] -v_cmpx_class_f16 vcc_hi, v2 -// GFX11: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] +v_cmpx_class_f16 vcc_hi, v2.l +// GFX11: v_cmpx_class_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0xfa,0x7d] -v_cmpx_class_f16 ttmp15, v2 -// GFX11: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] +v_cmpx_class_f16 ttmp15, v2.l +// GFX11: v_cmpx_class_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0xfa,0x7d] -v_cmpx_class_f16 m0, v2 -// GFX11: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] +v_cmpx_class_f16 m0, v2.l +// GFX11: v_cmpx_class_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0xfa,0x7d] -v_cmpx_class_f16 exec_lo, v2 -// GFX11: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] +v_cmpx_class_f16 exec_lo, v2.l +// GFX11: v_cmpx_class_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0xfa,0x7d] -v_cmpx_class_f16 exec_hi, v2 -// GFX11: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] +v_cmpx_class_f16 exec_hi, v2.l +// GFX11: v_cmpx_class_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0xfa,0x7d] -v_cmpx_class_f16 null, v2 -// GFX11: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] +v_cmpx_class_f16 null, v2.l +// GFX11: v_cmpx_class_f16_e32 null, v2.l ; encoding: 
[0x7c,0x04,0xfa,0x7d] -v_cmpx_class_f16 -1, v2 -// GFX11: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] +v_cmpx_class_f16 -1, v2.l +// GFX11: v_cmpx_class_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0xfa,0x7d] -v_cmpx_class_f16 0.5, v2 -// GFX11: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] +v_cmpx_class_f16 0.5, v2.l +// GFX11: v_cmpx_class_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0xfa,0x7d] -v_cmpx_class_f16 src_scc, v2 -// GFX11: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] +v_cmpx_class_f16 src_scc, v2.l +// GFX11: v_cmpx_class_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0xfa,0x7d] -v_cmpx_class_f16 0xfe0b, v127 -// GFX11: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_class_f16 0xfe0b, v127.l +// GFX11: v_cmpx_class_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_class_f16 v1.h, v2.l +// GFX11: v_cmpx_class_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0xfa,0x7d] + +v_cmpx_class_f16 v127.h, v2.l +// GFX11: v_cmpx_class_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0xfa,0x7d] + +v_cmpx_class_f16 0.5, v127.l +// GFX11: v_cmpx_class_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0xfa,0x7d] + +v_cmpx_class_f16 src_scc, v2.h +// GFX11: v_cmpx_class_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0xfb,0x7d] + +v_cmpx_class_f16 0xfe0b, v127.h +// GFX11: v_cmpx_class_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_class_f32 v1, v2 // GFX11: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index 57185330971e1a..2b565fa43bc2b8 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ -2,47 +2,56 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: 
llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_class_f16 v1, v2 row_mirror -// GFX11: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_half_mirror -// GFX11: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x35,0x30] +v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x35,0x30] + +v_cmpx_class_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_class_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_class_f16 -|v127.h|, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_class_f16 -|v127.h|, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x35,0x30] v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] // 
GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index e78840e08c4974..5b2e9ae507b0c3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -2,14 +2,23 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_class_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_class_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_class_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] 
fi:1 ; encoding: [0xea,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s index 7c9fa7f846d47a..50a30ecf3ba122 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s @@ -1,23 +1,41 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -v_cmpx_class_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_class_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_class_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + 
+v_cmpx_class_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction v_cmpx_eq_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index bffe5c7251ddf1..b7e1976a7ccf92 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -1,23 +1,41 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 %s -v_cmpx_class_f16 v1, v255 -// GFX11: v_cmpx_class_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_class_f16 v1.h, v255.h +// GFX11: v_cmpx_class_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_class_f16 v1, v255 
dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16 v255, v2 -// GFX11: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_class_f16 v1.l, v255.l +// GFX11: v_cmpx_class_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_class_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_class_f16 v255.h, v2.h +// GFX11: v_cmpx_class_f16_e64 v255.h, v2.h ; encoding: 
[0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_class_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_class_f16 v255.l, v2.l +// GFX11: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_f16 v1, v255 // GFX11: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index 6730482540060c..1540f498c0b21a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -2,17 +2,17 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_e64 v1, v2 -// GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_class_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_class_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_class_f16_e64 v255, v2 -// GFX12: v_cmpx_class_f16_e64 
v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_class_f16_e64 v255.l, v2.l +// GFX12: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_class_f16_e64 s1, v2 -// GFX12: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +v_cmpx_class_f16_e64 s1, v2.l +// GFX12: v_cmpx_class_f16_e64 s1, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] -v_cmpx_class_f16_e64 s105, v255 -// GFX12: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +v_cmpx_class_f16_e64 s105, v255.l +// GFX12: v_cmpx_class_f16_e64 s105, v255.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] v_cmpx_class_f16_e64 vcc_lo, s2 // GFX12: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00] @@ -47,6 +47,15 @@ v_cmpx_class_f16_e64 src_scc, vcc_lo v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi // GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_cmpx_class_f16_e64 v1.h, v2.h +// GFX12: v_cmpx_class_f16_e64 v1.h, v2.h ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_class_f16_e64 v255.h, v2.l +// GFX12: v_cmpx_class_f16_e64 v255.h, v2.l ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16_e64 s105, v255.h +// GFX12: v_cmpx_class_f16_e64 s105, v255.h ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] + v_cmpx_class_f32_e64 v1, v2 // GFX12: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index 2ffdf04ff886a5..d51b4e35b0484d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -2,53 +2,62 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: 
llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: 
v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_class_f16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] + +v_cmpx_class_f16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_class_f16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] 
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 05bce2e0e61f27..928443c1d19590 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -2,20 +2,29 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] 
-v_cmpx_class_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_class_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cmpx_class_f16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_class_f16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x11,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index 1392b9b8112f9f..cf0581edf9e365 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -2,50 +2,62 @@ // RUN: llvm-mc 
-triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_e32 v1, v2 -// GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] +v_cmpx_class_f16 v1.l, v2.l +// GFX12: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] -v_cmpx_class_f16 v127, v2 -// GFX12: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] +v_cmpx_class_f16 v127.l, v2.l +// GFX12: v_cmpx_class_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0xfa,0x7d] -v_cmpx_class_f16 s1, v2 -// GFX12: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] +v_cmpx_class_f16 s1, v2.l +// GFX12: v_cmpx_class_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0xfa,0x7d] -v_cmpx_class_f16 s105, v2 -// GFX12: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] +v_cmpx_class_f16 s105, v2.l +// GFX12: v_cmpx_class_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0xfa,0x7d] -v_cmpx_class_f16 vcc_lo, v2 -// GFX12: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] +v_cmpx_class_f16 vcc_lo, v2.l +// GFX12: v_cmpx_class_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0xfa,0x7d] -v_cmpx_class_f16 vcc_hi, v2 -// GFX12: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] +v_cmpx_class_f16 vcc_hi, v2.l +// GFX12: v_cmpx_class_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0xfa,0x7d] -v_cmpx_class_f16 ttmp15, v2 -// GFX12: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] +v_cmpx_class_f16 ttmp15, v2.l +// GFX12: v_cmpx_class_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0xfa,0x7d] -v_cmpx_class_f16 m0, v2 -// GFX12: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] +v_cmpx_class_f16 m0, v2.l +// GFX12: v_cmpx_class_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0xfa,0x7d] -v_cmpx_class_f16 exec_lo, v2 -// GFX12: v_cmpx_class_f16_e32 
exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] +v_cmpx_class_f16 exec_lo, v2.l +// GFX12: v_cmpx_class_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0xfa,0x7d] -v_cmpx_class_f16 exec_hi, v2 -// GFX12: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] +v_cmpx_class_f16 exec_hi, v2.l +// GFX12: v_cmpx_class_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0xfa,0x7d] -v_cmpx_class_f16 null, v2 -// GFX12: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] +v_cmpx_class_f16 null, v2.l +// GFX12: v_cmpx_class_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0xfa,0x7d] -v_cmpx_class_f16 -1, v2 -// GFX12: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] +v_cmpx_class_f16 -1, v2.l +// GFX12: v_cmpx_class_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0xfa,0x7d] -v_cmpx_class_f16 0.5, v2 -// GFX12: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] +v_cmpx_class_f16 0.5, v2.l +// GFX12: v_cmpx_class_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0xfa,0x7d] -v_cmpx_class_f16 src_scc, v2 -// GFX12: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] +v_cmpx_class_f16 src_scc, v2.l +// GFX12: v_cmpx_class_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0xfa,0x7d] -v_cmpx_class_f16 0xfe0b, v127 -// GFX12: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_class_f16 0xfe0b, v127.l +// GFX12: v_cmpx_class_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_class_f16 v1.h, v2.l +// GFX12: v_cmpx_class_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0xfa,0x7d] + +v_cmpx_class_f16 v127.h, v2.l +// GFX12: v_cmpx_class_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0xfa,0x7d] + +v_cmpx_class_f16 src_scc, v2.h +// GFX12: v_cmpx_class_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0xfb,0x7d] + +v_cmpx_class_f16 0xfe0b, v127.h +// GFX12: v_cmpx_class_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_class_f32 v1, v2 // GFX12: v_cmpx_class_f32_e32 
v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index c8f98351818373..97f56535364c75 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -2,47 +2,53 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_class_f16 v1, v2 row_mirror -// GFX12: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x35,0x30] +v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x35,0x30] + +v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_class_f16 -|v127.h|, 
v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_class_f16 -|v127.h|, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x35,0x30] v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 3e7922d2acbda8..834c89dd30cddb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -2,14 +2,20 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_class_f16 v1.h, v2.h 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_class_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s index cb317443d28281..39afbf4d47be08 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s @@ -1,23 +1,41 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s -v_cmpx_class_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_class_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_class_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v1.l, v255.l 
dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_cmpx_class_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction -v_cmpx_class_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction v_cmpx_eq_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index f3278c826475ad..ca84ac51f6dd86 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -1,23 +1,41 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 %s -v_cmpx_class_f16 v1, v255 -// GFX12: v_cmpx_class_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_class_f16 v1.h, v255.h +// GFX12: v_cmpx_class_f16_e64 v1.h, v255.h ; 
encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_class_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16 v255, v2 -// GFX12: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_class_f16 v1.l, v255.l +// GFX12: v_cmpx_class_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_class_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_class_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + 
+v_cmpx_class_f16 v255.h, v2.h +// GFX12: v_cmpx_class_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_class_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_class_f16 v255.l, v2.l +// GFX12: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_f16 v1, v255 // GFX12: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index ab5f0af5f66294..fea883471177f9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -5,46 +5,72 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + 
+0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt index 6867126e9c70e6..826374f8f830ae 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt @@ -5,10 +5,24 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp 
v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x11,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index b9d7a5296cc5e5..281eb66be5a184 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -5,19 +5,24 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00 -# GFX11: v_cmpx_class_f16_e64 v1, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 v1.l, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 v1, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] 0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00 -# GFX11: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00 -# GFX11: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 s1, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00 -# GFX11: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX11-REAL16: 
v_cmpx_class_f16_e64 s105, v255.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] 0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00 # GFX11: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00] @@ -52,6 +57,18 @@ 0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] +0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] + +0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 +# GFX11-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] + 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index 913e7536275811..caabe1eecc1a06 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -5,49 +5,84 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 0x01,0x05,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] 0x7f,0x05,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] 0x01,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] 0x69,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] 0x6a,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] 0x6b,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] 0x7b,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] 0x7d,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] 
0x7e,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] 0x7f,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] 0x7c,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] 0xc1,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] 0xf0,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] 0xfd,0x04,0xfa,0x7d -# GFX11: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] +# GFX11-REAL16: v_cmpx_class_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] 0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0xfa,0x7d +# GFX11-REAL16: v_cmpx_class_f16_e32 v1.h, v2.l ; encoding: 
[0x81,0x05,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0xfa,0x7d] + +0xff,0x05,0xfa,0x7d +# GFX11-REAL16: v_cmpx_class_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0xfa,0x7d] + +0xf0,0xfe,0xfa,0x7d +# GFX11-REAL16: v_cmpx_class_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0xfa,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0xfa,0x7d] + +0xfd,0x04,0xfb,0x7d +# GFX11-REAL16: v_cmpx_class_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0xfb,0x7d] +# GFX11-FAKE16: v_cmpx_class_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0xfb,0x7d] + +0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_class_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0xfc,0x7d # GFX11: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index 8919d86071f4dd..f660760cd9c51b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -5,46 +5,72 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30 -# GFX11: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] + +0xfa,0xfe,0xfa,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_class_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_class_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_class_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_cmpx_class_f16 -|v127.h|, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_cmpx_class_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index 867fd7374b7881..f32adeb61b16ee 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -5,10 +5,24 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s 
--check-prefixes=GFX11,GFX11-FAKE16 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0xfa,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_class_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0xfa,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_class_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_class_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_class_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] 
0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt index 55e0bf6c525ecb..fae898c04b6cbb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt @@ -5,16 +5,20 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00 -# GFX12: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00 -# GFX12: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 s1, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00 -# GFX12: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] 
0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00 # GFX12: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00] @@ -49,6 +53,18 @@ 0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] +0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] + +0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] + 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt index 041e43f4d05e50..ffbdcba67ce18e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt @@ -5,49 +5,125 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: 
v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# 
GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# 
GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] + +0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 s1, v2.l ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 s1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] + +0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] + +0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 vcc_lo, s2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 vcc_hi, s105 ; encoding: [0x7e,0x00,0xfd,0xd4,0x6b,0xd2,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 ttmp15, ttmp15 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7b,0xf6,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00 +# GFX12: v_cmpx_class_f16_e64 m0, src_scc ; encoding: [0x7e,0x00,0xfd,0xd4,0x7d,0xfa,0x01,0x00] + +0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_class_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xfd,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xfd,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: 
v_cmpx_class_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xfd,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 0.5, m0 ; encoding: [0x7e,0x00,0xfd,0xd4,0xf0,0xfa,0x00,0x00] + +0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xfd,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 
0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index 35e7a45a7b1625..ae945cbad54aea 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -5,13 +5,28 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.l dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x11,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e64_dpp -|v255.l|, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x11,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] 0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 9e5959ca4a77e8..ac83043628cb48 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -5,49 +5,80 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | 
FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0x01,0x05,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] 0x7f,0x05,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0xfa,0x7d] 0x01,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 s1, v2 ; encoding: [0x01,0x04,0xfa,0x7d] 0x69,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 s105, v2 ; encoding: [0x69,0x04,0xfa,0x7d] 0x6a,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0xfa,0x7d] 0x6b,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0xfa,0x7d] 0x7b,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0xfa,0x7d] 0x7d,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 m0, 
v2.l ; encoding: [0x7d,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0xfa,0x7d] 0x7e,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0xfa,0x7d] 0x7f,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0xfa,0x7d] 0x7c,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 null, v2 ; encoding: [0x7c,0x04,0xfa,0x7d] 0xc1,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0xfa,0x7d] 0xf0,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0xfa,0x7d] 0xfd,0x04,0xfa,0x7d -# GFX12: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] +# GFX12-REAL16: v_cmpx_class_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0xfa,0x7d] 0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0xfa,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0xfa,0x7d +# GFX12-REAL16: v_cmpx_class_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0xfa,0x7d] + +0xff,0x05,0xfa,0x7d +# GFX12-REAL16: v_cmpx_class_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0xfa,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0xfa,0x7d] + +0xfd,0x04,0xfb,0x7d +# GFX12-REAL16: v_cmpx_class_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0xfb,0x7d] +# GFX12-FAKE16: v_cmpx_class_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0xfb,0x7d] + +0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_class_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0xfb,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0xfc,0x7d # GFX12: v_cmpx_class_f32_e32 v1, v2 ; encoding: [0x01,0x05,0xfc,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index 8ecef5536ad798..0db4a98489683f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -5,46 +5,68 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x01,0x01,0xff] 
0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30 -# GFX12: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_cmpx_class_f16 -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_cmpx_class_f16 -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0xfa,0x7d,0x7f,0x6f,0x3d,0x30] + +0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_class_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_class_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0xfb,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_cmpx_class_f16 -|v127.h|, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_cmpx_class_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfb,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index 147084df5384fd..9bcc6a89ff5d8e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -5,10 +5,20 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_class_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfa,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_class_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_class_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfb,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_class_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_class_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfb,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05] From 18e9d3dbe5386dd3b88df7e3352f81498cfa6182 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 22 Jan 2025 15:57:16 -0500 Subject: [PATCH 037/208] [AMDGPU][True16][MC] true16 for v_cmpx_xx_u/i16 (#123424) A bulk commit of true16 support for v_cmp_xx_i/u16 instructions including: v_cmpx_lt_i16 v_cmpx_eq_i16 v_cmpx_le_i16 v_cmpx_gt_i16 v_cmpx_ne_i16 v_cmpx_ge_i16 v_cmpx_lt_u16 v_cmpx_eq_u16 v_cmpx_le_u16 v_cmpx_gt_u16 v_cmpx_ne_u16 v_cmpx_ge_u16 --- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 24 +- 
.../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s | 780 +++++----- .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s | 252 +++- .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s | 168 ++- llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s | 900 +++++++----- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s | 780 +++++----- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s | 252 +++- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 504 +++++-- .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s | 504 +++++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s | 168 ++- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 876 ++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 348 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s | 864 ++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s | 744 +++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s | 216 ++- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 504 +++++-- .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s | 504 +++++-- .../gfx11_dasm_vop3_dpp16_from_vopcx.txt | 648 ++++++--- .../gfx11_dasm_vop3_dpp8_from_vopcx.txt | 216 ++- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 168 ++- .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt | 780 +++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt | 648 ++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt | 288 +++- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 192 ++- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 1284 ++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 288 +++- .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt | 732 +++++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt | 600 +++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt | 168 ++- 29 files changed, 10068 insertions(+), 4332 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 0f80271686d4a6..aa930249c50035 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -2026,18 +2026,18 @@ defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>; defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>; 
defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; -defm V_CMPX_LT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; -defm V_CMPX_EQ_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; -defm V_CMPX_LE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; -defm V_CMPX_GT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; -defm V_CMPX_NE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; -defm V_CMPX_GE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; -defm V_CMPX_LT_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; -defm V_CMPX_EQ_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; -defm V_CMPX_LE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; -defm V_CMPX_GT_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; -defm V_CMPX_NE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; -defm V_CMPX_GE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; +defm V_CMPX_LT_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; +defm V_CMPX_EQ_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; +defm V_CMPX_LE_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; +defm V_CMPX_GT_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; +defm V_CMPX_NE_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; +defm V_CMPX_GE_I16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; +defm V_CMPX_LT_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; +defm V_CMPX_EQ_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; +defm V_CMPX_LE_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; +defm V_CMPX_GT_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; +defm V_CMPX_NE_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; 
+defm V_CMPX_GE_U16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>; defm V_CMPX_LT_I32 : VOPCX_Real_gfx11_gfx12<0x0c1>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index 379142e84aabdb..80264a4a791bbc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -180,47 +180,56 @@ v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_eq_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_eq_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -264,47 +273,56 @@ v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_eq_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_eq_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_eq_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -600,47 +618,56 @@ v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: 
v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ge_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ge_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: 
v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ge_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -684,47 +711,56 @@ v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_ge_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
@@ -852,47 +888,56 @@ v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_gt_i16_e64_dpp 
v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_gt_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_gt_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_gt_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -936,47 +981,56 @@ v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l 
quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_gt_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_gt_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_gt_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1104,47 +1158,56 @@ v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// 
GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_le_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_le_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_le_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] 
v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1188,47 +1251,56 @@ v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 
fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_le_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_le_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_le_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1449,47 +1521,56 @@ v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lt_i16_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_lt_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_lt_i16_e64_dpp v1.h, v2.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_lt_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1533,47 +1614,56 @@ v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 
+v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, 
v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_lt_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1617,47 +1707,56 @@ v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ne_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ne_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_ne_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1701,47 +1800,56 @@ v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ne_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ne_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ne_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index 4d6928ecbbc767..119e4826b32774 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ -53,14 +53,23 @@ v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x92,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -71,14 +80,23 @@ v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_i32_e64_dpp v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_eq_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0x7e,0x10,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -155,14 +173,23 @@ v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x96,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_ge_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x08,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ge_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -173,14 +200,23 @@ v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ge_u16_e64_dpp v255.l, v255.l 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -215,14 +251,23 @@ v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x94,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// 
GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_gt_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -233,14 +278,23 @@ v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_gt_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_gt_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -275,14 +329,23 @@ v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0x7e,0x83,0x93,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_le_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: 
v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -293,14 +356,23 @@ v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_le_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x08,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -368,14 +440,23 @@ v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x91,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_lt_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_lt_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -386,14 +467,23 @@ v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp 
v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -404,14 +494,23 @@ v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: 
v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ne_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -422,14 +521,23 @@ v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_ne_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ne_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 0d8dc8b1bbc8b4..1614f00e1f07e5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -275,11 +275,11 @@ v_cmpx_eq_f64_e64 -|src_scc|, -|exec| v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_eq_i16_e64 v1, v2 -// GFX11: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_eq_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_i16_e64 v255, v255 -// GFX11: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_eq_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_i16_e64 s1, s2 // GFX11: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] @@ -320,6 +320,12 @@ v_cmpx_eq_i16_e64 src_scc, vcc_lo v_cmpx_eq_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_eq_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_i32_e64 v1, v2 // GFX11: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -401,11 +407,11 @@ v_cmpx_eq_i64_e64 src_scc, exec v_cmpx_eq_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_u16_e64 v1, v2 -// GFX11: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 
+v_cmpx_eq_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_eq_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_u16_e64 v255, v255 -// GFX11: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_eq_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_u16_e64 s1, s2 // GFX11: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] @@ -446,6 +452,12 @@ v_cmpx_eq_u16_e64 src_scc, vcc_lo v_cmpx_eq_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_eq_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_eq_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_u32_e64 v1, v2 // GFX11: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -941,11 +953,11 @@ v_cmpx_ge_f64_e64 -|src_scc|, -|exec| v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ge_i16_e64 v1, v2 -// GFX11: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_ge_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_i16_e64 v255, v255 -// GFX11: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_ge_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_i16_e64 s1, s2 // GFX11: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] @@ -986,6 +998,12 @@ v_cmpx_ge_i16_e64 src_scc, vcc_lo v_cmpx_ge_i16_e64 0xfe0b, 
vcc_hi // GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_ge_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_i32_e64 v1, v2 // GFX11: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -1067,11 +1085,11 @@ v_cmpx_ge_i64_e64 src_scc, exec v_cmpx_ge_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_u16_e64 v1, v2 -// GFX11: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_ge_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_u16_e64 v255, v255 -// GFX11: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_ge_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_u16_e64 s1, s2 // GFX11: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00] @@ -1112,6 +1130,12 @@ v_cmpx_ge_u16_e64 src_scc, vcc_lo v_cmpx_ge_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_ge_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_u32_e64 v1, v2 // GFX11: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -1319,11 +1343,11 @@ v_cmpx_gt_f64_e64 -|src_scc|, -|exec| v_cmpx_gt_f64_e64 0xaf123456, -|vcc| 
clamp // GFX11: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_gt_i16_e64 v1, v2 -// GFX11: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_gt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_i16_e64 v255, v255 -// GFX11: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_gt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_i16_e64 s1, s2 // GFX11: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00] @@ -1364,6 +1388,12 @@ v_cmpx_gt_i16_e64 src_scc, vcc_lo v_cmpx_gt_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_gt_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_i32_e64 v1, v2 // GFX11: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1445,11 +1475,11 @@ v_cmpx_gt_i64_e64 src_scc, exec v_cmpx_gt_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_u16_e64 v1, v2 -// GFX11: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_gt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_u16_e64 v255, v255 -// GFX11: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_gt_u16_e64 v255.l, v255.l ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_u16_e64 s1, s2 // GFX11: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] @@ -1490,6 +1520,12 @@ v_cmpx_gt_u16_e64 src_scc, vcc_lo v_cmpx_gt_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_gt_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_u32_e64 v1, v2 // GFX11: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1697,11 +1733,11 @@ v_cmpx_le_f64_e64 -|src_scc|, -|exec| v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_le_i16_e64 v1, v2 -// GFX11: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_le_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_i16_e64 v255, v255 -// GFX11: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_le_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_i16_e64 s1, s2 // GFX11: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] @@ -1742,6 +1778,12 @@ v_cmpx_le_i16_e64 src_scc, vcc_lo v_cmpx_le_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_le_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_le_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_le_i16_e64 v255.l, v255.h ; 
encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_i32_e64 v1, v2 // GFX11: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1823,11 +1865,11 @@ v_cmpx_le_i64_e64 src_scc, exec v_cmpx_le_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_u16_e64 v1, v2 -// GFX11: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_le_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_u16_e64 v255, v255 -// GFX11: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_le_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_u16_e64 s1, s2 // GFX11: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] @@ -1868,6 +1910,12 @@ v_cmpx_le_u16_e64 src_scc, vcc_lo v_cmpx_le_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_le_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_le_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_le_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_u32_e64 v1, v2 // GFX11: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -2207,11 +2255,11 @@ v_cmpx_lt_f64_e64 -|src_scc|, -|exec| v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_lt_i16_e64 v1, v2 -// GFX11: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_lt_i16_e64 v1.l, v2.l ; encoding: 
[0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_i16_e64 v255, v255 -// GFX11: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_lt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_i16_e64 s1, s2 // GFX11: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] @@ -2252,6 +2300,12 @@ v_cmpx_lt_i16_e64 src_scc, vcc_lo v_cmpx_lt_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_lt_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_i32_e64 v1, v2 // GFX11: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2333,11 +2387,11 @@ v_cmpx_lt_i64_e64 src_scc, exec v_cmpx_lt_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lt_u16_e64 v1, v2 -// GFX11: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_lt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_u16_e64 v255, v255 -// GFX11: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_lt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_u16_e64 s1, s2 // GFX11: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] @@ -2378,6 +2432,12 @@ v_cmpx_lt_u16_e64 src_scc, vcc_lo v_cmpx_lt_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_lt_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_u32_e64 v1, v2 // GFX11: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2459,11 +2519,11 @@ v_cmpx_lt_u64_e64 src_scc, exec v_cmpx_lt_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ne_i16_e64 v1, v2 -// GFX11: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ne_i16_e64 v1.l, v2.l +// GFX11: v_cmpx_ne_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ne_i16_e64 v255, v255 -// GFX11: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ne_i16_e64 v255.l, v255.l +// GFX11: v_cmpx_ne_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_i16_e64 s1, s2 // GFX11: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] @@ -2504,6 +2564,12 @@ v_cmpx_ne_i16_e64 src_scc, vcc_lo v_cmpx_ne_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_i16_e64 v1.h, v2.l +// GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ne_i16_e64 v255.l, v255.h +// GFX11: v_cmpx_ne_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ne_i32_e64 v1, v2 // GFX11: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2585,11 +2651,11 @@ v_cmpx_ne_i64_e64 src_scc, exec v_cmpx_ne_i64_e64 0xaf123456, vcc // GFX11: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: 
[0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ne_u16_e64 v1, v2 -// GFX11: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ne_u16_e64 v1.l, v2.l +// GFX11: v_cmpx_ne_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ne_u16_e64 v255, v255 -// GFX11: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ne_u16_e64 v255.l, v255.l +// GFX11: v_cmpx_ne_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_u16_e64 s1, s2 // GFX11: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] @@ -2630,6 +2696,12 @@ v_cmpx_ne_u16_e64 src_scc, vcc_lo v_cmpx_ne_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_u16_e64 v1.h, v2.l +// GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ne_u16_e64 v255.l, v255.h +// GFX11: v_cmpx_ne_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ne_u32_e64 v1, v2 // GFX11: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index d3eff378e630f2..cdad89321d89a1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -269,50 +269,65 @@ v_cmpx_eq_f64 src_scc, v[2:3] v_cmpx_eq_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_i16 v1, v2 -// GFX11: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] +v_cmpx_eq_i16 v1.l, v2.l +// GFX11: v_cmpx_eq_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x64,0x7d] -v_cmpx_eq_i16 v127, v2 -// GFX11: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] +v_cmpx_eq_i16 v127.l, v2.l +// GFX11: 
v_cmpx_eq_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x64,0x7d] -v_cmpx_eq_i16 s1, v2 -// GFX11: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] +v_cmpx_eq_i16 s1, v2.l +// GFX11: v_cmpx_eq_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x64,0x7d] -v_cmpx_eq_i16 s105, v2 -// GFX11: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] +v_cmpx_eq_i16 s105, v2.l +// GFX11: v_cmpx_eq_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x64,0x7d] -v_cmpx_eq_i16 vcc_lo, v2 -// GFX11: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] +v_cmpx_eq_i16 vcc_lo, v2.l +// GFX11: v_cmpx_eq_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x64,0x7d] -v_cmpx_eq_i16 vcc_hi, v2 -// GFX11: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] +v_cmpx_eq_i16 vcc_hi, v2.l +// GFX11: v_cmpx_eq_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x64,0x7d] -v_cmpx_eq_i16 ttmp15, v2 -// GFX11: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] +v_cmpx_eq_i16 ttmp15, v2.l +// GFX11: v_cmpx_eq_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x64,0x7d] -v_cmpx_eq_i16 m0, v2 -// GFX11: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] +v_cmpx_eq_i16 m0, v2.l +// GFX11: v_cmpx_eq_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x64,0x7d] -v_cmpx_eq_i16 exec_lo, v2 -// GFX11: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] +v_cmpx_eq_i16 exec_lo, v2.l +// GFX11: v_cmpx_eq_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x64,0x7d] -v_cmpx_eq_i16 exec_hi, v2 -// GFX11: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] +v_cmpx_eq_i16 exec_hi, v2.l +// GFX11: v_cmpx_eq_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x64,0x7d] -v_cmpx_eq_i16 null, v2 -// GFX11: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] +v_cmpx_eq_i16 null, v2.l +// GFX11: v_cmpx_eq_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x64,0x7d] -v_cmpx_eq_i16 -1, v2 -// GFX11: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] +v_cmpx_eq_i16 -1, v2.l +// GFX11: 
v_cmpx_eq_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x64,0x7d] -v_cmpx_eq_i16 0.5, v2 -// GFX11: v_cmpx_eq_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x64,0x7d] +v_cmpx_eq_i16 0.5, v2.l +// GFX11: v_cmpx_eq_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x64,0x7d] -v_cmpx_eq_i16 src_scc, v2 -// GFX11: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] +v_cmpx_eq_i16 src_scc, v2.l +// GFX11: v_cmpx_eq_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x64,0x7d] -v_cmpx_eq_i16 0xfe0b, v127 -// GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_i16 0xfe0b, v127.l +// GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_eq_i16 v1.h, v2.l +// GFX11: v_cmpx_eq_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x64,0x7d] + +v_cmpx_eq_i16 v127.h, v2.l +// GFX11: v_cmpx_eq_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x64,0x7d] + +v_cmpx_eq_i16 0.5, v127.l +// GFX11: v_cmpx_eq_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x64,0x7d] + +v_cmpx_eq_i16 src_scc, v2.h +// GFX11: v_cmpx_eq_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x65,0x7d] + +v_cmpx_eq_i16 0xfe0b, v127.h +// GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_i32 v1, v2 // GFX11: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] @@ -395,50 +410,65 @@ v_cmpx_eq_i64 src_scc, v[2:3] v_cmpx_eq_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_u16 v1, v2 -// GFX11: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] +v_cmpx_eq_u16 v1.l, v2.l +// GFX11: v_cmpx_eq_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x74,0x7d] + +v_cmpx_eq_u16 v127.l, v2.l +// GFX11: v_cmpx_eq_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x74,0x7d] + +v_cmpx_eq_u16 s1, v2.l +// GFX11: v_cmpx_eq_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x74,0x7d] + +v_cmpx_eq_u16 s105, v2.l +// GFX11: v_cmpx_eq_u16_e32 
s105, v2.l ; encoding: [0x69,0x04,0x74,0x7d] -v_cmpx_eq_u16 v127, v2 -// GFX11: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] +v_cmpx_eq_u16 vcc_lo, v2.l +// GFX11: v_cmpx_eq_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x74,0x7d] -v_cmpx_eq_u16 s1, v2 -// GFX11: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] +v_cmpx_eq_u16 vcc_hi, v2.l +// GFX11: v_cmpx_eq_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x74,0x7d] -v_cmpx_eq_u16 s105, v2 -// GFX11: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] +v_cmpx_eq_u16 ttmp15, v2.l +// GFX11: v_cmpx_eq_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x74,0x7d] -v_cmpx_eq_u16 vcc_lo, v2 -// GFX11: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] +v_cmpx_eq_u16 m0, v2.l +// GFX11: v_cmpx_eq_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x74,0x7d] -v_cmpx_eq_u16 vcc_hi, v2 -// GFX11: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] +v_cmpx_eq_u16 exec_lo, v2.l +// GFX11: v_cmpx_eq_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x74,0x7d] -v_cmpx_eq_u16 ttmp15, v2 -// GFX11: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] +v_cmpx_eq_u16 exec_hi, v2.l +// GFX11: v_cmpx_eq_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x74,0x7d] -v_cmpx_eq_u16 m0, v2 -// GFX11: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] +v_cmpx_eq_u16 null, v2.l +// GFX11: v_cmpx_eq_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x74,0x7d] -v_cmpx_eq_u16 exec_lo, v2 -// GFX11: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] +v_cmpx_eq_u16 -1, v2.l +// GFX11: v_cmpx_eq_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x74,0x7d] -v_cmpx_eq_u16 exec_hi, v2 -// GFX11: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] +v_cmpx_eq_u16 0.5, v2.l +// GFX11: v_cmpx_eq_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x74,0x7d] -v_cmpx_eq_u16 null, v2 -// GFX11: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] +v_cmpx_eq_u16 src_scc, v2.l +// GFX11: v_cmpx_eq_u16_e32 
src_scc, v2.l ; encoding: [0xfd,0x04,0x74,0x7d] -v_cmpx_eq_u16 -1, v2 -// GFX11: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] +v_cmpx_eq_u16 0xfe0b, v127.l +// GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_eq_u16 0.5, v2 -// GFX11: v_cmpx_eq_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x74,0x7d] +v_cmpx_eq_u16 v1.h, v2.l +// GFX11: v_cmpx_eq_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x74,0x7d] -v_cmpx_eq_u16 src_scc, v2 -// GFX11: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] +v_cmpx_eq_u16 v127.h, v2.l +// GFX11: v_cmpx_eq_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x74,0x7d] -v_cmpx_eq_u16 0xfe0b, v127 -// GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_u16 0.5, v127.l +// GFX11: v_cmpx_eq_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x74,0x7d] + +v_cmpx_eq_u16 src_scc, v2.h +// GFX11: v_cmpx_eq_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x75,0x7d] + +v_cmpx_eq_u16 0xfe0b, v127.h +// GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_u32 v1, v2 // GFX11: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d] @@ -935,50 +965,65 @@ v_cmpx_ge_f64 src_scc, v[2:3] v_cmpx_ge_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_i16 v1, v2 -// GFX11: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] +v_cmpx_ge_i16 v1.l, v2.l +// GFX11: v_cmpx_ge_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6c,0x7d] + +v_cmpx_ge_i16 v127.l, v2.l +// GFX11: v_cmpx_ge_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6c,0x7d] + +v_cmpx_ge_i16 s1, v2.l +// GFX11: v_cmpx_ge_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6c,0x7d] + +v_cmpx_ge_i16 s105, v2.l +// GFX11: v_cmpx_ge_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6c,0x7d] + +v_cmpx_ge_i16 vcc_lo, v2.l +// GFX11: v_cmpx_ge_i16_e32 vcc_lo, v2.l ; encoding: 
[0x6a,0x04,0x6c,0x7d] + +v_cmpx_ge_i16 vcc_hi, v2.l +// GFX11: v_cmpx_ge_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6c,0x7d] -v_cmpx_ge_i16 v127, v2 -// GFX11: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] +v_cmpx_ge_i16 ttmp15, v2.l +// GFX11: v_cmpx_ge_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6c,0x7d] -v_cmpx_ge_i16 s1, v2 -// GFX11: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] +v_cmpx_ge_i16 m0, v2.l +// GFX11: v_cmpx_ge_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6c,0x7d] -v_cmpx_ge_i16 s105, v2 -// GFX11: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] +v_cmpx_ge_i16 exec_lo, v2.l +// GFX11: v_cmpx_ge_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6c,0x7d] -v_cmpx_ge_i16 vcc_lo, v2 -// GFX11: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] +v_cmpx_ge_i16 exec_hi, v2.l +// GFX11: v_cmpx_ge_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6c,0x7d] -v_cmpx_ge_i16 vcc_hi, v2 -// GFX11: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] +v_cmpx_ge_i16 null, v2.l +// GFX11: v_cmpx_ge_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6c,0x7d] -v_cmpx_ge_i16 ttmp15, v2 -// GFX11: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] +v_cmpx_ge_i16 -1, v2.l +// GFX11: v_cmpx_ge_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6c,0x7d] -v_cmpx_ge_i16 m0, v2 -// GFX11: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] +v_cmpx_ge_i16 0.5, v2.l +// GFX11: v_cmpx_ge_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x6c,0x7d] -v_cmpx_ge_i16 exec_lo, v2 -// GFX11: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] +v_cmpx_ge_i16 src_scc, v2.l +// GFX11: v_cmpx_ge_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6c,0x7d] -v_cmpx_ge_i16 exec_hi, v2 -// GFX11: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] +v_cmpx_ge_i16 0xfe0b, v127.l +// GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ge_i16 null, v2 -// GFX11: 
v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] +v_cmpx_ge_i16 v1.h, v2.l +// GFX11: v_cmpx_ge_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6c,0x7d] -v_cmpx_ge_i16 -1, v2 -// GFX11: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] +v_cmpx_ge_i16 v127.h, v2.l +// GFX11: v_cmpx_ge_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6c,0x7d] -v_cmpx_ge_i16 0.5, v2 -// GFX11: v_cmpx_ge_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x6c,0x7d] +v_cmpx_ge_i16 0.5, v127.l +// GFX11: v_cmpx_ge_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x6c,0x7d] -v_cmpx_ge_i16 src_scc, v2 -// GFX11: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] +v_cmpx_ge_i16 src_scc, v2.h +// GFX11: v_cmpx_ge_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6d,0x7d] -v_cmpx_ge_i16 0xfe0b, v127 -// GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_i16 0xfe0b, v127.h +// GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_i32 v1, v2 // GFX11: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] @@ -1061,50 +1106,65 @@ v_cmpx_ge_i64 src_scc, v[2:3] v_cmpx_ge_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_u16 v1, v2 -// GFX11: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] +v_cmpx_ge_u16 v1.l, v2.l +// GFX11: v_cmpx_ge_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7c,0x7d] -v_cmpx_ge_u16 v127, v2 -// GFX11: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] +v_cmpx_ge_u16 v127.l, v2.l +// GFX11: v_cmpx_ge_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7c,0x7d] -v_cmpx_ge_u16 s1, v2 -// GFX11: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] +v_cmpx_ge_u16 s1, v2.l +// GFX11: v_cmpx_ge_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7c,0x7d] -v_cmpx_ge_u16 s105, v2 -// GFX11: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] +v_cmpx_ge_u16 s105, 
v2.l +// GFX11: v_cmpx_ge_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7c,0x7d] -v_cmpx_ge_u16 vcc_lo, v2 -// GFX11: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] +v_cmpx_ge_u16 vcc_lo, v2.l +// GFX11: v_cmpx_ge_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7c,0x7d] -v_cmpx_ge_u16 vcc_hi, v2 -// GFX11: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] +v_cmpx_ge_u16 vcc_hi, v2.l +// GFX11: v_cmpx_ge_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7c,0x7d] -v_cmpx_ge_u16 ttmp15, v2 -// GFX11: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] +v_cmpx_ge_u16 ttmp15, v2.l +// GFX11: v_cmpx_ge_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7c,0x7d] -v_cmpx_ge_u16 m0, v2 -// GFX11: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] +v_cmpx_ge_u16 m0, v2.l +// GFX11: v_cmpx_ge_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7c,0x7d] -v_cmpx_ge_u16 exec_lo, v2 -// GFX11: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] +v_cmpx_ge_u16 exec_lo, v2.l +// GFX11: v_cmpx_ge_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7c,0x7d] -v_cmpx_ge_u16 exec_hi, v2 -// GFX11: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] +v_cmpx_ge_u16 exec_hi, v2.l +// GFX11: v_cmpx_ge_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7c,0x7d] -v_cmpx_ge_u16 null, v2 -// GFX11: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] +v_cmpx_ge_u16 null, v2.l +// GFX11: v_cmpx_ge_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7c,0x7d] -v_cmpx_ge_u16 -1, v2 -// GFX11: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] +v_cmpx_ge_u16 -1, v2.l +// GFX11: v_cmpx_ge_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7c,0x7d] -v_cmpx_ge_u16 0.5, v2 -// GFX11: v_cmpx_ge_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x7c,0x7d] +v_cmpx_ge_u16 0.5, v2.l +// GFX11: v_cmpx_ge_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x7c,0x7d] -v_cmpx_ge_u16 src_scc, v2 -// GFX11: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] +v_cmpx_ge_u16 src_scc, 
v2.l +// GFX11: v_cmpx_ge_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7c,0x7d] -v_cmpx_ge_u16 0xfe0b, v127 -// GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_u16 0xfe0b, v127.l +// GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ge_u16 v1.h, v2.l +// GFX11: v_cmpx_ge_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7c,0x7d] + +v_cmpx_ge_u16 v127.h, v2.l +// GFX11: v_cmpx_ge_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7c,0x7d] + +v_cmpx_ge_u16 0.5, v127.l +// GFX11: v_cmpx_ge_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x7c,0x7d] + +v_cmpx_ge_u16 src_scc, v2.h +// GFX11: v_cmpx_ge_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7d,0x7d] + +v_cmpx_ge_u16 0xfe0b, v127.h +// GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_u32 v1, v2 // GFX11: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] @@ -1313,50 +1373,65 @@ v_cmpx_gt_f64 src_scc, v[2:3] v_cmpx_gt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_i16 v1, v2 -// GFX11: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] +v_cmpx_gt_i16 v1.l, v2.l +// GFX11: v_cmpx_gt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x68,0x7d] + +v_cmpx_gt_i16 v127.l, v2.l +// GFX11: v_cmpx_gt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x68,0x7d] + +v_cmpx_gt_i16 s1, v2.l +// GFX11: v_cmpx_gt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x68,0x7d] + +v_cmpx_gt_i16 s105, v2.l +// GFX11: v_cmpx_gt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x68,0x7d] -v_cmpx_gt_i16 v127, v2 -// GFX11: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] +v_cmpx_gt_i16 vcc_lo, v2.l +// GFX11: v_cmpx_gt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x68,0x7d] -v_cmpx_gt_i16 s1, v2 -// GFX11: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] +v_cmpx_gt_i16 vcc_hi, v2.l +// GFX11: 
v_cmpx_gt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x68,0x7d] -v_cmpx_gt_i16 s105, v2 -// GFX11: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] +v_cmpx_gt_i16 ttmp15, v2.l +// GFX11: v_cmpx_gt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x68,0x7d] -v_cmpx_gt_i16 vcc_lo, v2 -// GFX11: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] +v_cmpx_gt_i16 m0, v2.l +// GFX11: v_cmpx_gt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x68,0x7d] -v_cmpx_gt_i16 vcc_hi, v2 -// GFX11: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] +v_cmpx_gt_i16 exec_lo, v2.l +// GFX11: v_cmpx_gt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x68,0x7d] -v_cmpx_gt_i16 ttmp15, v2 -// GFX11: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] +v_cmpx_gt_i16 exec_hi, v2.l +// GFX11: v_cmpx_gt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x68,0x7d] -v_cmpx_gt_i16 m0, v2 -// GFX11: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] +v_cmpx_gt_i16 null, v2.l +// GFX11: v_cmpx_gt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x68,0x7d] -v_cmpx_gt_i16 exec_lo, v2 -// GFX11: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] +v_cmpx_gt_i16 -1, v2.l +// GFX11: v_cmpx_gt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x68,0x7d] -v_cmpx_gt_i16 exec_hi, v2 -// GFX11: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] +v_cmpx_gt_i16 0.5, v2.l +// GFX11: v_cmpx_gt_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x68,0x7d] -v_cmpx_gt_i16 null, v2 -// GFX11: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] +v_cmpx_gt_i16 src_scc, v2.l +// GFX11: v_cmpx_gt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x68,0x7d] -v_cmpx_gt_i16 -1, v2 -// GFX11: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] +v_cmpx_gt_i16 0xfe0b, v127.l +// GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_i16 0.5, v2 -// GFX11: v_cmpx_gt_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x68,0x7d] +v_cmpx_gt_i16 v1.h, 
v2.l +// GFX11: v_cmpx_gt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x68,0x7d] -v_cmpx_gt_i16 src_scc, v2 -// GFX11: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] +v_cmpx_gt_i16 v127.h, v2.l +// GFX11: v_cmpx_gt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x68,0x7d] -v_cmpx_gt_i16 0xfe0b, v127 -// GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_i16 0.5, v127.l +// GFX11: v_cmpx_gt_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x68,0x7d] + +v_cmpx_gt_i16 src_scc, v2.h +// GFX11: v_cmpx_gt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x69,0x7d] + +v_cmpx_gt_i16 0xfe0b, v127.h +// GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_i32 v1, v2 // GFX11: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] @@ -1439,50 +1514,65 @@ v_cmpx_gt_i64 src_scc, v[2:3] v_cmpx_gt_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_u16 v1, v2 -// GFX11: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] +v_cmpx_gt_u16 v1.l, v2.l +// GFX11: v_cmpx_gt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x78,0x7d] + +v_cmpx_gt_u16 v127.l, v2.l +// GFX11: v_cmpx_gt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x78,0x7d] + +v_cmpx_gt_u16 s1, v2.l +// GFX11: v_cmpx_gt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x78,0x7d] + +v_cmpx_gt_u16 s105, v2.l +// GFX11: v_cmpx_gt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x78,0x7d] + +v_cmpx_gt_u16 vcc_lo, v2.l +// GFX11: v_cmpx_gt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x78,0x7d] + +v_cmpx_gt_u16 vcc_hi, v2.l +// GFX11: v_cmpx_gt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x78,0x7d] -v_cmpx_gt_u16 v127, v2 -// GFX11: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] +v_cmpx_gt_u16 ttmp15, v2.l +// GFX11: v_cmpx_gt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x78,0x7d] -v_cmpx_gt_u16 s1, v2 -// GFX11: v_cmpx_gt_u16_e32 s1, 
v2 ; encoding: [0x01,0x04,0x78,0x7d] +v_cmpx_gt_u16 m0, v2.l +// GFX11: v_cmpx_gt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x78,0x7d] -v_cmpx_gt_u16 s105, v2 -// GFX11: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] +v_cmpx_gt_u16 exec_lo, v2.l +// GFX11: v_cmpx_gt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x78,0x7d] -v_cmpx_gt_u16 vcc_lo, v2 -// GFX11: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] +v_cmpx_gt_u16 exec_hi, v2.l +// GFX11: v_cmpx_gt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x78,0x7d] -v_cmpx_gt_u16 vcc_hi, v2 -// GFX11: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] +v_cmpx_gt_u16 null, v2.l +// GFX11: v_cmpx_gt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x78,0x7d] -v_cmpx_gt_u16 ttmp15, v2 -// GFX11: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] +v_cmpx_gt_u16 -1, v2.l +// GFX11: v_cmpx_gt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x78,0x7d] -v_cmpx_gt_u16 m0, v2 -// GFX11: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] +v_cmpx_gt_u16 0.5, v2.l +// GFX11: v_cmpx_gt_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x78,0x7d] -v_cmpx_gt_u16 exec_lo, v2 -// GFX11: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] +v_cmpx_gt_u16 src_scc, v2.l +// GFX11: v_cmpx_gt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x78,0x7d] -v_cmpx_gt_u16 exec_hi, v2 -// GFX11: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] +v_cmpx_gt_u16 0xfe0b, v127.l +// GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_u16 null, v2 -// GFX11: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] +v_cmpx_gt_u16 v1.h, v2.l +// GFX11: v_cmpx_gt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x78,0x7d] -v_cmpx_gt_u16 -1, v2 -// GFX11: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] +v_cmpx_gt_u16 v127.h, v2.l +// GFX11: v_cmpx_gt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x78,0x7d] -v_cmpx_gt_u16 0.5, v2 -// GFX11: 
v_cmpx_gt_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x78,0x7d] +v_cmpx_gt_u16 0.5, v127.l +// GFX11: v_cmpx_gt_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x78,0x7d] -v_cmpx_gt_u16 src_scc, v2 -// GFX11: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] +v_cmpx_gt_u16 src_scc, v2.h +// GFX11: v_cmpx_gt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x79,0x7d] -v_cmpx_gt_u16 0xfe0b, v127 -// GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_u16 0xfe0b, v127.h +// GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_u32 v1, v2 // GFX11: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] @@ -1691,50 +1781,65 @@ v_cmpx_le_f64 src_scc, v[2:3] v_cmpx_le_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_i16 v1, v2 -// GFX11: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] +v_cmpx_le_i16 v1.l, v2.l +// GFX11: v_cmpx_le_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x66,0x7d] -v_cmpx_le_i16 v127, v2 -// GFX11: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] +v_cmpx_le_i16 v127.l, v2.l +// GFX11: v_cmpx_le_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x66,0x7d] -v_cmpx_le_i16 s1, v2 -// GFX11: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] +v_cmpx_le_i16 s1, v2.l +// GFX11: v_cmpx_le_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x66,0x7d] -v_cmpx_le_i16 s105, v2 -// GFX11: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] +v_cmpx_le_i16 s105, v2.l +// GFX11: v_cmpx_le_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x66,0x7d] -v_cmpx_le_i16 vcc_lo, v2 -// GFX11: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] +v_cmpx_le_i16 vcc_lo, v2.l +// GFX11: v_cmpx_le_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x66,0x7d] -v_cmpx_le_i16 vcc_hi, v2 -// GFX11: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] 
+v_cmpx_le_i16 vcc_hi, v2.l +// GFX11: v_cmpx_le_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x66,0x7d] -v_cmpx_le_i16 ttmp15, v2 -// GFX11: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] +v_cmpx_le_i16 ttmp15, v2.l +// GFX11: v_cmpx_le_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x66,0x7d] -v_cmpx_le_i16 m0, v2 -// GFX11: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] +v_cmpx_le_i16 m0, v2.l +// GFX11: v_cmpx_le_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x66,0x7d] -v_cmpx_le_i16 exec_lo, v2 -// GFX11: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] +v_cmpx_le_i16 exec_lo, v2.l +// GFX11: v_cmpx_le_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x66,0x7d] -v_cmpx_le_i16 exec_hi, v2 -// GFX11: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] +v_cmpx_le_i16 exec_hi, v2.l +// GFX11: v_cmpx_le_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x66,0x7d] -v_cmpx_le_i16 null, v2 -// GFX11: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] +v_cmpx_le_i16 null, v2.l +// GFX11: v_cmpx_le_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x66,0x7d] -v_cmpx_le_i16 -1, v2 -// GFX11: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] +v_cmpx_le_i16 -1, v2.l +// GFX11: v_cmpx_le_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x66,0x7d] -v_cmpx_le_i16 0.5, v2 -// GFX11: v_cmpx_le_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x66,0x7d] +v_cmpx_le_i16 0.5, v2.l +// GFX11: v_cmpx_le_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x66,0x7d] -v_cmpx_le_i16 src_scc, v2 -// GFX11: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] +v_cmpx_le_i16 src_scc, v2.l +// GFX11: v_cmpx_le_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x66,0x7d] -v_cmpx_le_i16 0xfe0b, v127 -// GFX11: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_i16 0xfe0b, v127.l +// GFX11: v_cmpx_le_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_le_i16 v1.h, v2.l +// GFX11: 
v_cmpx_le_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x66,0x7d] + +v_cmpx_le_i16 v127.h, v2.l +// GFX11: v_cmpx_le_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x66,0x7d] + +v_cmpx_le_i16 0.5, v127.l +// GFX11: v_cmpx_le_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x66,0x7d] + +v_cmpx_le_i16 src_scc, v2.h +// GFX11: v_cmpx_le_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x67,0x7d] + +v_cmpx_le_i16 0xfe0b, v127.h +// GFX11: v_cmpx_le_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_i32 v1, v2 // GFX11: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] @@ -1817,50 +1922,65 @@ v_cmpx_le_i64 src_scc, v[2:3] v_cmpx_le_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_u16 v1, v2 -// GFX11: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] +v_cmpx_le_u16 v1.l, v2.l +// GFX11: v_cmpx_le_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x76,0x7d] + +v_cmpx_le_u16 v127.l, v2.l +// GFX11: v_cmpx_le_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x76,0x7d] + +v_cmpx_le_u16 s1, v2.l +// GFX11: v_cmpx_le_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x76,0x7d] + +v_cmpx_le_u16 s105, v2.l +// GFX11: v_cmpx_le_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x76,0x7d] -v_cmpx_le_u16 v127, v2 -// GFX11: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] +v_cmpx_le_u16 vcc_lo, v2.l +// GFX11: v_cmpx_le_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x76,0x7d] -v_cmpx_le_u16 s1, v2 -// GFX11: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] +v_cmpx_le_u16 vcc_hi, v2.l +// GFX11: v_cmpx_le_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x76,0x7d] -v_cmpx_le_u16 s105, v2 -// GFX11: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] +v_cmpx_le_u16 ttmp15, v2.l +// GFX11: v_cmpx_le_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x76,0x7d] -v_cmpx_le_u16 vcc_lo, v2 -// GFX11: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] 
+v_cmpx_le_u16 m0, v2.l +// GFX11: v_cmpx_le_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x76,0x7d] -v_cmpx_le_u16 vcc_hi, v2 -// GFX11: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] +v_cmpx_le_u16 exec_lo, v2.l +// GFX11: v_cmpx_le_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x76,0x7d] -v_cmpx_le_u16 ttmp15, v2 -// GFX11: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] +v_cmpx_le_u16 exec_hi, v2.l +// GFX11: v_cmpx_le_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x76,0x7d] -v_cmpx_le_u16 m0, v2 -// GFX11: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] +v_cmpx_le_u16 null, v2.l +// GFX11: v_cmpx_le_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x76,0x7d] -v_cmpx_le_u16 exec_lo, v2 -// GFX11: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] +v_cmpx_le_u16 -1, v2.l +// GFX11: v_cmpx_le_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x76,0x7d] -v_cmpx_le_u16 exec_hi, v2 -// GFX11: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] +v_cmpx_le_u16 0.5, v2.l +// GFX11: v_cmpx_le_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x76,0x7d] -v_cmpx_le_u16 null, v2 -// GFX11: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] +v_cmpx_le_u16 src_scc, v2.l +// GFX11: v_cmpx_le_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x76,0x7d] -v_cmpx_le_u16 -1, v2 -// GFX11: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] +v_cmpx_le_u16 0xfe0b, v127.l +// GFX11: v_cmpx_le_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_u16 0.5, v2 -// GFX11: v_cmpx_le_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x76,0x7d] +v_cmpx_le_u16 v1.h, v2.l +// GFX11: v_cmpx_le_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x76,0x7d] -v_cmpx_le_u16 src_scc, v2 -// GFX11: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] +v_cmpx_le_u16 v127.h, v2.l +// GFX11: v_cmpx_le_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x76,0x7d] -v_cmpx_le_u16 0xfe0b, v127 -// GFX11: v_cmpx_le_u16_e32 0xfe0b, v127 ; 
encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_u16 0.5, v127.l +// GFX11: v_cmpx_le_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x76,0x7d] + +v_cmpx_le_u16 src_scc, v2.h +// GFX11: v_cmpx_le_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x77,0x7d] + +v_cmpx_le_u16 0xfe0b, v127.h +// GFX11: v_cmpx_le_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_u32 v1, v2 // GFX11: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] @@ -2210,50 +2330,65 @@ v_cmpx_lt_f64 src_scc, v[2:3] v_cmpx_lt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_i16 v1, v2 -// GFX11: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] +v_cmpx_lt_i16 v1.l, v2.l +// GFX11: v_cmpx_lt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x62,0x7d] + +v_cmpx_lt_i16 v127.l, v2.l +// GFX11: v_cmpx_lt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x62,0x7d] + +v_cmpx_lt_i16 s1, v2.l +// GFX11: v_cmpx_lt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x62,0x7d] + +v_cmpx_lt_i16 s105, v2.l +// GFX11: v_cmpx_lt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x62,0x7d] + +v_cmpx_lt_i16 vcc_lo, v2.l +// GFX11: v_cmpx_lt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x62,0x7d] + +v_cmpx_lt_i16 vcc_hi, v2.l +// GFX11: v_cmpx_lt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x62,0x7d] -v_cmpx_lt_i16 v127, v2 -// GFX11: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] +v_cmpx_lt_i16 ttmp15, v2.l +// GFX11: v_cmpx_lt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x62,0x7d] -v_cmpx_lt_i16 s1, v2 -// GFX11: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] +v_cmpx_lt_i16 m0, v2.l +// GFX11: v_cmpx_lt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x62,0x7d] -v_cmpx_lt_i16 s105, v2 -// GFX11: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] +v_cmpx_lt_i16 exec_lo, v2.l +// GFX11: v_cmpx_lt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x62,0x7d] -v_cmpx_lt_i16 
vcc_lo, v2 -// GFX11: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] +v_cmpx_lt_i16 exec_hi, v2.l +// GFX11: v_cmpx_lt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x62,0x7d] -v_cmpx_lt_i16 vcc_hi, v2 -// GFX11: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] +v_cmpx_lt_i16 null, v2.l +// GFX11: v_cmpx_lt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x62,0x7d] -v_cmpx_lt_i16 ttmp15, v2 -// GFX11: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] +v_cmpx_lt_i16 -1, v2.l +// GFX11: v_cmpx_lt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x62,0x7d] -v_cmpx_lt_i16 m0, v2 -// GFX11: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] +v_cmpx_lt_i16 0.5, v2.l +// GFX11: v_cmpx_lt_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x62,0x7d] -v_cmpx_lt_i16 exec_lo, v2 -// GFX11: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] +v_cmpx_lt_i16 src_scc, v2.l +// GFX11: v_cmpx_lt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x62,0x7d] -v_cmpx_lt_i16 exec_hi, v2 -// GFX11: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] +v_cmpx_lt_i16 0xfe0b, v127.l +// GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_lt_i16 null, v2 -// GFX11: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] +v_cmpx_lt_i16 v1.h, v2.l +// GFX11: v_cmpx_lt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x62,0x7d] -v_cmpx_lt_i16 -1, v2 -// GFX11: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] +v_cmpx_lt_i16 v127.h, v2.l +// GFX11: v_cmpx_lt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x62,0x7d] -v_cmpx_lt_i16 0.5, v2 -// GFX11: v_cmpx_lt_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x62,0x7d] +v_cmpx_lt_i16 0.5, v127.l +// GFX11: v_cmpx_lt_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x62,0x7d] -v_cmpx_lt_i16 src_scc, v2 -// GFX11: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] +v_cmpx_lt_i16 src_scc, v2.h +// GFX11: v_cmpx_lt_i16_e32 src_scc, v2.h ; encoding: 
[0xfd,0x04,0x63,0x7d] -v_cmpx_lt_i16 0xfe0b, v127 -// GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_i16 0xfe0b, v127.h +// GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_i32 v1, v2 // GFX11: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] @@ -2336,50 +2471,65 @@ v_cmpx_lt_i64 src_scc, v[2:3] v_cmpx_lt_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_u16 v1, v2 -// GFX11: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] +v_cmpx_lt_u16 v1.l, v2.l +// GFX11: v_cmpx_lt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x72,0x7d] -v_cmpx_lt_u16 v127, v2 -// GFX11: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] +v_cmpx_lt_u16 v127.l, v2.l +// GFX11: v_cmpx_lt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x72,0x7d] -v_cmpx_lt_u16 s1, v2 -// GFX11: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] +v_cmpx_lt_u16 s1, v2.l +// GFX11: v_cmpx_lt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x72,0x7d] -v_cmpx_lt_u16 s105, v2 -// GFX11: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] +v_cmpx_lt_u16 s105, v2.l +// GFX11: v_cmpx_lt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x72,0x7d] -v_cmpx_lt_u16 vcc_lo, v2 -// GFX11: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] +v_cmpx_lt_u16 vcc_lo, v2.l +// GFX11: v_cmpx_lt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x72,0x7d] -v_cmpx_lt_u16 vcc_hi, v2 -// GFX11: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] +v_cmpx_lt_u16 vcc_hi, v2.l +// GFX11: v_cmpx_lt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x72,0x7d] -v_cmpx_lt_u16 ttmp15, v2 -// GFX11: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] +v_cmpx_lt_u16 ttmp15, v2.l +// GFX11: v_cmpx_lt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x72,0x7d] -v_cmpx_lt_u16 m0, v2 -// GFX11: v_cmpx_lt_u16_e32 
m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] +v_cmpx_lt_u16 m0, v2.l +// GFX11: v_cmpx_lt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x72,0x7d] -v_cmpx_lt_u16 exec_lo, v2 -// GFX11: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] +v_cmpx_lt_u16 exec_lo, v2.l +// GFX11: v_cmpx_lt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x72,0x7d] -v_cmpx_lt_u16 exec_hi, v2 -// GFX11: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] +v_cmpx_lt_u16 exec_hi, v2.l +// GFX11: v_cmpx_lt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x72,0x7d] -v_cmpx_lt_u16 null, v2 -// GFX11: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] +v_cmpx_lt_u16 null, v2.l +// GFX11: v_cmpx_lt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x72,0x7d] -v_cmpx_lt_u16 -1, v2 -// GFX11: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] +v_cmpx_lt_u16 -1, v2.l +// GFX11: v_cmpx_lt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x72,0x7d] -v_cmpx_lt_u16 0.5, v2 -// GFX11: v_cmpx_lt_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x72,0x7d] +v_cmpx_lt_u16 0.5, v2.l +// GFX11: v_cmpx_lt_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x72,0x7d] -v_cmpx_lt_u16 src_scc, v2 -// GFX11: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] +v_cmpx_lt_u16 src_scc, v2.l +// GFX11: v_cmpx_lt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x72,0x7d] -v_cmpx_lt_u16 0xfe0b, v127 -// GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_u16 0xfe0b, v127.l +// GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lt_u16 v1.h, v2.l +// GFX11: v_cmpx_lt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x72,0x7d] + +v_cmpx_lt_u16 v127.h, v2.l +// GFX11: v_cmpx_lt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x72,0x7d] + +v_cmpx_lt_u16 0.5, v127.l +// GFX11: v_cmpx_lt_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x72,0x7d] + +v_cmpx_lt_u16 src_scc, v2.h +// GFX11: v_cmpx_lt_u16_e32 src_scc, v2.h ; encoding: 
[0xfd,0x04,0x73,0x7d] + +v_cmpx_lt_u16 0xfe0b, v127.h +// GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_u32 v1, v2 // GFX11: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] @@ -2462,50 +2612,65 @@ v_cmpx_lt_u64 src_scc, v[2:3] v_cmpx_lt_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ne_i16 v1, v2 -// GFX11: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] +v_cmpx_ne_i16 v1.l, v2.l +// GFX11: v_cmpx_ne_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6a,0x7d] + +v_cmpx_ne_i16 v127.l, v2.l +// GFX11: v_cmpx_ne_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6a,0x7d] + +v_cmpx_ne_i16 s1, v2.l +// GFX11: v_cmpx_ne_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6a,0x7d] + +v_cmpx_ne_i16 s105, v2.l +// GFX11: v_cmpx_ne_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6a,0x7d] -v_cmpx_ne_i16 v127, v2 -// GFX11: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] +v_cmpx_ne_i16 vcc_lo, v2.l +// GFX11: v_cmpx_ne_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6a,0x7d] -v_cmpx_ne_i16 s1, v2 -// GFX11: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] +v_cmpx_ne_i16 vcc_hi, v2.l +// GFX11: v_cmpx_ne_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6a,0x7d] -v_cmpx_ne_i16 s105, v2 -// GFX11: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] +v_cmpx_ne_i16 ttmp15, v2.l +// GFX11: v_cmpx_ne_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6a,0x7d] -v_cmpx_ne_i16 vcc_lo, v2 -// GFX11: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] +v_cmpx_ne_i16 m0, v2.l +// GFX11: v_cmpx_ne_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6a,0x7d] -v_cmpx_ne_i16 vcc_hi, v2 -// GFX11: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] +v_cmpx_ne_i16 exec_lo, v2.l +// GFX11: v_cmpx_ne_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6a,0x7d] -v_cmpx_ne_i16 ttmp15, v2 -// GFX11: v_cmpx_ne_i16_e32 ttmp15, v2 
; encoding: [0x7b,0x04,0x6a,0x7d] +v_cmpx_ne_i16 exec_hi, v2.l +// GFX11: v_cmpx_ne_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6a,0x7d] -v_cmpx_ne_i16 m0, v2 -// GFX11: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] +v_cmpx_ne_i16 null, v2.l +// GFX11: v_cmpx_ne_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6a,0x7d] -v_cmpx_ne_i16 exec_lo, v2 -// GFX11: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] +v_cmpx_ne_i16 -1, v2.l +// GFX11: v_cmpx_ne_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6a,0x7d] -v_cmpx_ne_i16 exec_hi, v2 -// GFX11: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] +v_cmpx_ne_i16 0.5, v2.l +// GFX11: v_cmpx_ne_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x6a,0x7d] -v_cmpx_ne_i16 null, v2 -// GFX11: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] +v_cmpx_ne_i16 src_scc, v2.l +// GFX11: v_cmpx_ne_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6a,0x7d] -v_cmpx_ne_i16 -1, v2 -// GFX11: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] +v_cmpx_ne_i16 0xfe0b, v127.l +// GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ne_i16 0.5, v2 -// GFX11: v_cmpx_ne_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x6a,0x7d] +v_cmpx_ne_i16 v1.h, v2.l +// GFX11: v_cmpx_ne_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6a,0x7d] -v_cmpx_ne_i16 src_scc, v2 -// GFX11: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] +v_cmpx_ne_i16 v127.h, v2.l +// GFX11: v_cmpx_ne_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6a,0x7d] -v_cmpx_ne_i16 0xfe0b, v127 -// GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_i16 0.5, v127.l +// GFX11: v_cmpx_ne_i16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x6a,0x7d] + +v_cmpx_ne_i16 src_scc, v2.h +// GFX11: v_cmpx_ne_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6b,0x7d] + +v_cmpx_ne_i16 0xfe0b, v127.h +// GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127.h ; encoding: 
[0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ne_i32 v1, v2 // GFX11: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] @@ -2588,50 +2753,65 @@ v_cmpx_ne_i64 src_scc, v[2:3] v_cmpx_ne_i64 0xaf123456, v[254:255] // GFX11: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ne_u16 v1, v2 -// GFX11: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] +v_cmpx_ne_u16 v1.l, v2.l +// GFX11: v_cmpx_ne_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7a,0x7d] + +v_cmpx_ne_u16 v127.l, v2.l +// GFX11: v_cmpx_ne_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7a,0x7d] + +v_cmpx_ne_u16 s1, v2.l +// GFX11: v_cmpx_ne_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7a,0x7d] + +v_cmpx_ne_u16 s105, v2.l +// GFX11: v_cmpx_ne_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7a,0x7d] + +v_cmpx_ne_u16 vcc_lo, v2.l +// GFX11: v_cmpx_ne_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7a,0x7d] + +v_cmpx_ne_u16 vcc_hi, v2.l +// GFX11: v_cmpx_ne_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7a,0x7d] -v_cmpx_ne_u16 v127, v2 -// GFX11: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] +v_cmpx_ne_u16 ttmp15, v2.l +// GFX11: v_cmpx_ne_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7a,0x7d] -v_cmpx_ne_u16 s1, v2 -// GFX11: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] +v_cmpx_ne_u16 m0, v2.l +// GFX11: v_cmpx_ne_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7a,0x7d] -v_cmpx_ne_u16 s105, v2 -// GFX11: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] +v_cmpx_ne_u16 exec_lo, v2.l +// GFX11: v_cmpx_ne_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7a,0x7d] -v_cmpx_ne_u16 vcc_lo, v2 -// GFX11: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] +v_cmpx_ne_u16 exec_hi, v2.l +// GFX11: v_cmpx_ne_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7a,0x7d] -v_cmpx_ne_u16 vcc_hi, v2 -// GFX11: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] +v_cmpx_ne_u16 null, v2.l +// GFX11: v_cmpx_ne_u16_e32 null, v2.l 
; encoding: [0x7c,0x04,0x7a,0x7d] -v_cmpx_ne_u16 ttmp15, v2 -// GFX11: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] +v_cmpx_ne_u16 -1, v2.l +// GFX11: v_cmpx_ne_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7a,0x7d] -v_cmpx_ne_u16 m0, v2 -// GFX11: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] +v_cmpx_ne_u16 0.5, v2.l +// GFX11: v_cmpx_ne_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x7a,0x7d] -v_cmpx_ne_u16 exec_lo, v2 -// GFX11: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] +v_cmpx_ne_u16 src_scc, v2.l +// GFX11: v_cmpx_ne_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7a,0x7d] -v_cmpx_ne_u16 exec_hi, v2 -// GFX11: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] +v_cmpx_ne_u16 0xfe0b, v127.l +// GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ne_u16 null, v2 -// GFX11: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] +v_cmpx_ne_u16 v1.h, v2.l +// GFX11: v_cmpx_ne_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7a,0x7d] -v_cmpx_ne_u16 -1, v2 -// GFX11: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] +v_cmpx_ne_u16 v127.h, v2.l +// GFX11: v_cmpx_ne_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7a,0x7d] -v_cmpx_ne_u16 0.5, v2 -// GFX11: v_cmpx_ne_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x7a,0x7d] +v_cmpx_ne_u16 0.5, v127.l +// GFX11: v_cmpx_ne_u16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x7a,0x7d] -v_cmpx_ne_u16 src_scc, v2 -// GFX11: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] +v_cmpx_ne_u16 src_scc, v2.h +// GFX11: v_cmpx_ne_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7b,0x7d] -v_cmpx_ne_u16 0xfe0b, v127 -// GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_u16 0xfe0b, v127.h +// GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ne_u32 v1, v2 // GFX11: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index 2b565fa43bc2b8..ddaa30af953b8a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ -179,47 +179,56 @@ v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_eq_i16 v1, v2 row_mirror -// GFX11: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shl:1 -// GFX11: 
v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_eq_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] @@ -263,47 +272,56 @@ v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_eq_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_mirror -// GFX11: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] @@ -599,47 +617,56 @@ v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] + +v_cmpx_ge_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_mirror -// GFX11: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0xfa,0x04,0x6d,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] @@ -683,47 +710,56 @@ v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ge_u16 v1, v2 row_mirror -// GFX11: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_u16 
v1, v2 row_ror:15 -// GFX11: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_ge_u16 v127.l, v127.l row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX11: v_cmpx_ge_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ge_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff] @@ -851,47 +887,56 @@ v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_gt_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] 
+v_cmpx_gt_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_mirror -// GFX11: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_ror:15 +// GFX11: 
v_cmpx_gt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff] @@ -935,47 +980,56 @@ v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_gt_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] + +v_cmpx_gt_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_u16 v1.l, 
v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_mirror -// GFX11: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16 v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 
fi:0 -// GFX11: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x79,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_gt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff] @@ -1103,47 +1157,56 @@ v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_le_i16 v1, v2 row_mirror -// GFX11: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shl:1 -// GFX11: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] 
-v_cmpx_le_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_le_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff] @@ -1187,47 +1250,56 @@ v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] + 
+v_cmpx_le_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_mirror -// GFX11: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_ror:1 
+// GFX11: v_cmpx_le_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_ror:15 -// GFX11: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_le_u16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff] @@ -1448,47 +1520,56 @@ v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_i16 
v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_lt_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] + +v_cmpx_lt_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_mirror -// GFX11: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_i16 v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] @@ -1532,47 +1613,56 @@ v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lt_u16 v1, v2 row_mirror -// GFX11: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_u16 
v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_lt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] @@ -1616,47 +1706,56 @@ v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.l, v2.l 
quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_ne_i16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_mirror -// GFX11: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_half_mirror -// GFX11: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shl:1 -// GFX11: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shl:15 -// GFX11: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] 
+v_cmpx_ne_i16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shr:1 -// GFX11: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shr:15 -// GFX11: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_ror:1 -// GFX11: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_ror:15 -// GFX11: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ne_i16 
v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ne_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] @@ -1700,47 +1799,56 @@ v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_ne_u16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] + +v_cmpx_ne_u16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_mirror -// GFX11: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_half_mirror -// GFX11: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shl:1 -// GFX11: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ne_u16 
v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shl:15 -// GFX11: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shr:1 -// GFX11: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shr:15 -// GFX11: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_ror:1 -// GFX11: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16 v1, v2 row_ror:15 -// GFX11: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ne_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ne_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index 5b2e9ae507b0c3..1cead89c0a82e7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -47,14 +47,23 @@ v_cmpx_eq_f32 v1, v2 
dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_eq_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] @@ -65,14 +74,23 @@ v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: 
v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] @@ -137,14 +155,23 @@ v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ge_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] @@ -155,14 +182,23 @@ v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ge_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] @@ -191,14 +227,23 @@ v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] 
+v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] @@ -209,14 +254,23 @@ v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16 v1.l, v2.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_gt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] @@ -245,14 +299,23 @@ v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v1, v2 
dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_le_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] @@ -263,14 +326,23 @@ v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; 
encoding: [0xea,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] @@ -326,14 +398,23 @@ v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v127.l, v127.l 
dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_lt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] @@ -344,14 +425,23 @@ v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] @@ -362,14 +452,23 @@ v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 -// GFX11: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] @@ -380,14 +479,23 @@ v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ne_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05] -v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s index 50a30ecf3ba122..5cab502e996471 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s @@ -55,41 +55,77 @@ v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction 
-v_cmpx_eq_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: 
error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_f_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -127,41 +163,77 @@ v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: 
error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 
dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_gt_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -181,41 +253,77 @@ v_cmpx_gt_f16_e32 v255, v2 
dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: 
:[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + 
+v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_le_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -235,41 +343,77 @@ v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode 
+v_cmpx_le_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h +// GFX11: 
:[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lg_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -325,77 +469,149 @@ v_cmpx_lt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.l, v255.l 
dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for 
instruction +v_cmpx_lt_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
:[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid 
operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_neq_f16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index b7e1976a7ccf92..5102a320750663 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -55,41 +55,77 @@ v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_i16 v1, v255 -// GFX11: v_cmpx_eq_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_i16 v1.h, v255.h +// GFX11: 
v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16 v255, v2 -// GFX11: v_cmpx_eq_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_i16 v1.l, v255.l +// GFX11: v_cmpx_eq_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_u16 v1, v255 -// GFX11: 
v_cmpx_eq_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_i16 v255.h, v2.h +// GFX11: v_cmpx_eq_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_eq_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_eq_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_u16 v255, v2 -// GFX11: v_cmpx_eq_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_i16 v255.l, v2.l +// GFX11: v_cmpx_eq_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_eq_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_eq_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.h, v255.h +// GFX11: v_cmpx_eq_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_eq_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.l, v255.l +// GFX11: v_cmpx_eq_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_eq_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v255.h, v2.h +// GFX11: v_cmpx_eq_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v255.l, v2.l +// GFX11: v_cmpx_eq_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v2.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_f_f16 v1, v255 // GFX11: v_cmpx_f_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0xff,0x03,0x00] @@ -127,41 +163,77 @@ v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_i16 v1, v255 -// GFX11: v_cmpx_ge_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_i16 v1.h, v255.h +// GFX11: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ge_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v255.l +// GFX11: v_cmpx_ge_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ge_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v255.h, v2.h +// GFX11: v_cmpx_ge_i16_e64 v255.h, v2.h ; 
encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ge_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ge_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_i16 v255, v2 -// GFX11: v_cmpx_ge_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_i16 v255.l, v2.l +// GFX11: v_cmpx_ge_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ge_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ge_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_u16 v1, v255 -// GFX11: v_cmpx_ge_u16_e64 v1, v255 ; encoding: 
[0x7e,0x00,0xbe,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_u16 v1.h, v255.h +// GFX11: v_cmpx_ge_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16 v255, v2 -// GFX11: v_cmpx_ge_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_u16 v1.l, v255.l +// GFX11: v_cmpx_ge_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_u16 v255.h, v2.h +// GFX11: v_cmpx_ge_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ge_u16 v255.l, v2.l +// GFX11: v_cmpx_ge_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_f16 v1, v255 // GFX11: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] @@ -181,41 +253,77 @@ v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_i16 v1, v255 -// GFX11: v_cmpx_gt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_i16 v1.h, v255.h +// GFX11: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v1.l, v255.l +// GFX11: v_cmpx_gt_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v255.h, v2.h +// GFX11: v_cmpx_gt_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v255.l, v2.l +// GFX11: v_cmpx_gt_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_u16 v1.h, v255.h +// GFX11: v_cmpx_gt_u16_e64 v1.h, 
v255.h ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_i16 v255, v2 -// GFX11: v_cmpx_gt_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_u16 v1.l, v255.l +// GFX11: v_cmpx_gt_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_u16 v1, v255 -// GFX11: v_cmpx_gt_u16_e64 v1, v255 ; 
encoding: [0x7e,0x00,0xbc,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_u16 v255.h, v2.h +// GFX11: v_cmpx_gt_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_gt_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_u16 v255, v2 -// GFX11: v_cmpx_gt_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_u16 v255.l, v2.l +// GFX11: v_cmpx_gt_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_gt_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_f16 v1, v255 // GFX11: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] @@ -235,41 +343,77 @@ v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_i16 v1, v255 -// GFX11: v_cmpx_le_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_i16 v1.h, v255.h +// GFX11: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_le_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16 v255, v2 -// GFX11: v_cmpx_le_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_i16 v1.l, v255.l +// GFX11: v_cmpx_le_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_le_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// 
GFX11: v_cmpx_le_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_u16 v1, v255 -// GFX11: v_cmpx_le_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_i16 v255.h, v2.h +// GFX11: v_cmpx_le_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_u16 v255, v2 -// GFX11: v_cmpx_le_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_i16 v255.l, v2.l +// GFX11: v_cmpx_le_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.h, v255.h +// GFX11: v_cmpx_le_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.l, v255.l +// GFX11: v_cmpx_le_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v255.h, v2.h +// GFX11: v_cmpx_le_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_le_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp 
v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_le_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_le_u16 v255.l, v2.l +// GFX11: v_cmpx_le_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lg_f16 v1, v255 // GFX11: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] @@ -325,77 +469,149 @@ v_cmpx_lt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_i16 v1, v255 -// GFX11: v_cmpx_lt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lt_i16 v1.h, v255.h +// GFX11: v_cmpx_lt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v1.l, 
v255.l +// GFX11: v_cmpx_lt_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v255.h, v2.h +// GFX11: v_cmpx_lt_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v255.l, v2.l +// GFX11: v_cmpx_lt_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v1.h, v255.h +// GFX11: v_cmpx_lt_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16 v1.h, v255.h 
quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v1.l, v255.l +// GFX11: v_cmpx_lt_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v255.h, v2.h +// GFX11: v_cmpx_lt_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v255.l, v2.l +// GFX11: v_cmpx_lt_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ne_i16 v1.h, v255.h +// GFX11: v_cmpx_ne_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_i16 v1, v255 
dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_i16 v255, v2 -// GFX11: v_cmpx_lt_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_i16 v1.l, v255.l +// GFX11: v_cmpx_ne_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16 v1, v255 -// GFX11: v_cmpx_lt_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_i16 v255.h, v2.h +// 
GFX11: v_cmpx_ne_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_lt_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_lt_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_u16 v255, v2 -// GFX11: v_cmpx_lt_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_i16 v255.l, v2.l +// GFX11: v_cmpx_ne_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_lt_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_lt_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ne_i16 v1, v255 -// GFX11: 
v_cmpx_ne_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_u16 v1.h, v255.h +// GFX11: v_cmpx_ne_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ne_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ne_i16 v255, v2 -// GFX11: v_cmpx_ne_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_u16 v1.l, v255.l +// GFX11: v_cmpx_ne_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ne_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ne_u16 v1, v255 -// GFX11: v_cmpx_ne_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_u16 v255.h, v2.h +// GFX11: v_cmpx_ne_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ne_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ne_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ne_u16 v255, v2 -// GFX11: v_cmpx_ne_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_u16 v255.l, v2.l +// GFX11: v_cmpx_ne_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ne_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ne_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ne_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_u16 
v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_neq_f16 v1, v255 // GFX11: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index 1540f498c0b21a..d7bec00b830807 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -272,11 +272,11 @@ v_cmpx_eq_f64_e64 -|src_scc|, -|exec| v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_eq_i16_e64 v1, v2 -// GFX12: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_eq_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_i16_e64 v255, v255 -// GFX12: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_i16_e64 v255.l, v255.l +// GFX12: v_cmpx_eq_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_i16_e64 s1, s2 // GFX12: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] @@ -317,6 +317,12 @@ v_cmpx_eq_i16_e64 src_scc, vcc_lo v_cmpx_eq_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_eq_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_eq_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_i32_e64 v1, v2 // GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -398,11 +404,11 @@ 
v_cmpx_eq_i64_e64 src_scc, exec v_cmpx_eq_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_u16_e64 v1, v2 -// GFX12: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_u16_e64 v1.l, v2.l +// GFX12: v_cmpx_eq_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_u16_e64 v255, v255 -// GFX12: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_eq_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_u16_e64 s1, s2 // GFX12: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] @@ -443,6 +449,12 @@ v_cmpx_eq_u16_e64 src_scc, vcc_lo v_cmpx_eq_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_eq_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_u16_e64 v255.l, v255.h +// GFX12: v_cmpx_eq_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_u32_e64 v1, v2 // GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -650,11 +662,11 @@ v_cmpx_ge_f64_e64 -|src_scc|, -|exec| v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ge_i16_e64 v1, v2 -// GFX12: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_ge_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_i16_e64 v255, v255 -// GFX12: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_i16_e64 v255.l, 
v255.l +// GFX12: v_cmpx_ge_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_i16_e64 s1, s2 // GFX12: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] @@ -695,6 +707,12 @@ v_cmpx_ge_i16_e64 src_scc, vcc_lo v_cmpx_ge_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_ge_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_ge_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_i32_e64 v1, v2 // GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -776,11 +794,11 @@ v_cmpx_ge_i64_e64 src_scc, exec v_cmpx_ge_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_u16_e64 v1, v2 -// GFX12: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_u16_e64 v1.l, v2.l +// GFX12: v_cmpx_ge_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_u16_e64 v255, v255 -// GFX12: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_ge_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_u16_e64 s1, s2 // GFX12: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00] @@ -821,6 +839,12 @@ v_cmpx_ge_u16_e64 src_scc, vcc_lo v_cmpx_ge_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_ge_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_u16_e64 v255.l, v255.h +// GFX12: 
v_cmpx_ge_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_u32_e64 v1, v2 // GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -1028,11 +1052,11 @@ v_cmpx_gt_f64_e64 -|src_scc|, -|exec| v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_gt_i16_e64 v1, v2 -// GFX12: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_gt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_i16_e64 v255, v255 -// GFX12: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_i16_e64 v255.l, v255.l +// GFX12: v_cmpx_gt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_i16_e64 s1, s2 // GFX12: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00] @@ -1073,6 +1097,12 @@ v_cmpx_gt_i16_e64 src_scc, vcc_lo v_cmpx_gt_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_gt_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_gt_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_i32_e64 v1, v2 // GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1154,11 +1184,11 @@ v_cmpx_gt_i64_e64 src_scc, exec v_cmpx_gt_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_u16_e64 v1, v2 -// GFX12: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_u16_e64 v1.l, v2.l +// GFX12: 
v_cmpx_gt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_u16_e64 v255, v255 -// GFX12: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_gt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_u16_e64 s1, s2 // GFX12: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] @@ -1199,6 +1229,12 @@ v_cmpx_gt_u16_e64 src_scc, vcc_lo v_cmpx_gt_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_gt_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_u16_e64 v255.l, v255.h +// GFX12: v_cmpx_gt_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_u32_e64 v1, v2 // GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1406,11 +1442,11 @@ v_cmpx_le_f64_e64 -|src_scc|, -|exec| v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_le_i16_e64 v1, v2 -// GFX12: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_le_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_i16_e64 v255, v255 -// GFX12: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_i16_e64 v255.l, v255.l +// GFX12: v_cmpx_le_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_i16_e64 s1, s2 // GFX12: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] @@ -1451,6 +1487,12 @@ v_cmpx_le_i16_e64 src_scc, vcc_lo v_cmpx_le_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_le_i16_e64 
0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_le_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_le_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_le_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_i32_e64 v1, v2 // GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1532,11 +1574,11 @@ v_cmpx_le_i64_e64 src_scc, exec v_cmpx_le_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_u16_e64 v1, v2 -// GFX12: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_u16_e64 v1.l, v2.l +// GFX12: v_cmpx_le_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_u16_e64 v255, v255 -// GFX12: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_le_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_u16_e64 s1, s2 // GFX12: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] @@ -1577,6 +1619,12 @@ v_cmpx_le_u16_e64 src_scc, vcc_lo v_cmpx_le_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_le_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_le_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_u16_e64 v255.l, v255.h +// GFX12: v_cmpx_le_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_u32_e64 v1, v2 // GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -1916,11 +1964,11 @@ v_cmpx_lt_f64_e64 -|src_scc|, -|exec| v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_lt_f64_e64 
0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_lt_i16_e64 v1, v2 -// GFX12: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_lt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_i16_e64 v255, v255 -// GFX12: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_i16_e64 v255.l, v255.l +// GFX12: v_cmpx_lt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_i16_e64 s1, s2 // GFX12: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] @@ -1961,6 +2009,12 @@ v_cmpx_lt_i16_e64 src_scc, vcc_lo v_cmpx_lt_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_lt_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_lt_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_i32_e64 v1, v2 // GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2042,11 +2096,11 @@ v_cmpx_lt_i64_e64 src_scc, exec v_cmpx_lt_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lt_u16_e64 v1, v2 -// GFX12: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_u16_e64 v1.l, v2.l +// GFX12: v_cmpx_lt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_u16_e64 v255, v255 -// GFX12: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_lt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 
v_cmpx_lt_u16_e64 s1, s2 // GFX12: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] @@ -2087,6 +2141,12 @@ v_cmpx_lt_u16_e64 src_scc, vcc_lo v_cmpx_lt_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_lt_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_u16_e64 v255.l, v255.h +// GFX12: v_cmpx_lt_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_u32_e64 v1, v2 // GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2168,11 +2228,11 @@ v_cmpx_lt_u64_e64 src_scc, exec v_cmpx_lt_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ne_i16_e64 v1, v2 -// GFX12: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ne_i16_e64 v1.l, v2.l +// GFX12: v_cmpx_ne_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ne_i16_e64 v255, v255 -// GFX12: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ne_i16_e64 v255.l, v255.l +// GFX12: v_cmpx_ne_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_i16_e64 s1, s2 // GFX12: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] @@ -2213,6 +2273,12 @@ v_cmpx_ne_i16_e64 src_scc, vcc_lo v_cmpx_ne_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_i16_e64 v1.h, v2.l +// GFX12: v_cmpx_ne_i16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ne_i16_e64 v255.l, v255.h +// GFX12: v_cmpx_ne_i16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ne_i32_e64 
v1, v2 // GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2294,11 +2360,11 @@ v_cmpx_ne_i64_e64 src_scc, exec v_cmpx_ne_i64_e64 0xaf123456, vcc // GFX12: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ne_u16_e64 v1, v2 -// GFX12: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ne_u16_e64 v1.l, v2.l +// GFX12: v_cmpx_ne_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ne_u16_e64 v255, v255 -// GFX12: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ne_u16_e64 v255.l, v255.l +// GFX12: v_cmpx_ne_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_u16_e64 s1, s2 // GFX12: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] @@ -2339,6 +2405,12 @@ v_cmpx_ne_u16_e64 src_scc, vcc_lo v_cmpx_ne_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_u16_e64 v1.h, v2.l +// GFX12: v_cmpx_ne_u16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ne_u16_e64 v255.l, v255.h +// GFX12: v_cmpx_ne_u16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ne_u32_e64 v1, v2 // GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index d51b4e35b0484d..faad68f902d5f8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -203,53 +203,62 @@ v_cmpx_eq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, 
-|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_eq_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_eq_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
@@ -299,53 +308,62 @@ v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_eq_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_eq_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_eq_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] 
v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -491,53 +509,62 @@ v_cmpx_ge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ge_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ge_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ge_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ge_i16_e64_dpp v255.l, v255.h row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -587,53 +614,62 @@ v_cmpx_ge_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_u16_e64_dpp 
v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v2.l 
op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_ge_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -779,53 +815,62 @@ v_cmpx_gt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: 
v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 -// GFX12: 
v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_gt_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_gt_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_gt_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -875,53 +920,62 @@ v_cmpx_gt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror -// 
GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 -// 
GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_gt_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_gt_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_gt_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1067,53 +1121,62 @@ v_cmpx_le_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
-v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 
-// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_le_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_le_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_le_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1163,53 +1226,62 @@ v_cmpx_le_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, 
v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_le_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 
fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_le_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_le_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_le_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1460,53 +1532,62 @@ v_cmpx_lt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lt_i16_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_lt_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_lt_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_lt_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1556,53 +1637,62 @@ v_cmpx_lt_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 
-v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_lt_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1652,53 +1742,62 @@ v_cmpx_lt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ne_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ne_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror +// GFX12: 
v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:15 +// 
GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ne_i16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ne_i16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cmpx_ne_i16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1748,53 +1847,62 @@ v_cmpx_ne_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ne_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] 
+v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_cmpx_ne_u16_e64_dpp v1.h, v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_cmpx_ne_u16_e64_dpp v1.h, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_cmpx_ne_u16_e64_dpp v255.l, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 928443c1d19590..588ad2b75a4106 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -77,20 +77,29 @@ v_cmpx_eq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x92,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: 
v_cmpx_eq_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -107,20 +116,29 @@ v_cmpx_eq_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16_e64_dpp v1, 10 
dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_eq_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -173,20 +191,29 @@ v_cmpx_ge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x96,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: 
v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ge_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
+v_cmpx_ge_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -203,20 +230,29 @@ v_cmpx_ge_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_u16_e64_dpp v1, 10 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -269,20 +305,29 @@ v_cmpx_gt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x94,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: 
v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_gt_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -299,20 +344,29 @@ v_cmpx_gt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_gt_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -365,20 +419,29 @@ v_cmpx_le_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] 
fi:1 v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x93,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_le_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -395,20 +458,29 @@ v_cmpx_le_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_le_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: 
v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -506,20 +578,29 @@ v_cmpx_lt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x91,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; 
encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_lt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_lt_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -536,20 +617,29 @@ v_cmpx_lt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -566,20 +656,29 @@ v_cmpx_lt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+v_cmpx_ne_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ne_i16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -596,20 +695,29 @@ v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] 
fi:0 // GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ne_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_u16_e64_dpp 
v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16_e64_dpp v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x08,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_cmpx_ne_u16_e64_dpp v255.l, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index cf0581edf9e365..4d43b98978eb56 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -266,50 +266,62 @@ v_cmpx_eq_f64 src_scc, v[2:3] v_cmpx_eq_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_i16 v1, v2 -// GFX12: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] +v_cmpx_eq_i16 v1.l, v2.l +// GFX12: v_cmpx_eq_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x64,0x7d] -v_cmpx_eq_i16 v127, v2 -// GFX12: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] +v_cmpx_eq_i16 v127.l, v2.l +// GFX12: v_cmpx_eq_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x64,0x7d] -v_cmpx_eq_i16 s1, v2 -// GFX12: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] +v_cmpx_eq_i16 s1, v2.l +// GFX12: v_cmpx_eq_i16_e32 s1, v2.l ; encoding: 
[0x01,0x04,0x64,0x7d] -v_cmpx_eq_i16 s105, v2 -// GFX12: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] +v_cmpx_eq_i16 s105, v2.l +// GFX12: v_cmpx_eq_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x64,0x7d] -v_cmpx_eq_i16 vcc_lo, v2 -// GFX12: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] +v_cmpx_eq_i16 vcc_lo, v2.l +// GFX12: v_cmpx_eq_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x64,0x7d] -v_cmpx_eq_i16 vcc_hi, v2 -// GFX12: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] +v_cmpx_eq_i16 vcc_hi, v2.l +// GFX12: v_cmpx_eq_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x64,0x7d] -v_cmpx_eq_i16 ttmp15, v2 -// GFX12: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] +v_cmpx_eq_i16 ttmp15, v2.l +// GFX12: v_cmpx_eq_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x64,0x7d] -v_cmpx_eq_i16 m0, v2 -// GFX12: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] +v_cmpx_eq_i16 m0, v2.l +// GFX12: v_cmpx_eq_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x64,0x7d] -v_cmpx_eq_i16 exec_lo, v2 -// GFX12: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] +v_cmpx_eq_i16 exec_lo, v2.l +// GFX12: v_cmpx_eq_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x64,0x7d] -v_cmpx_eq_i16 exec_hi, v2 -// GFX12: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] +v_cmpx_eq_i16 exec_hi, v2.l +// GFX12: v_cmpx_eq_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x64,0x7d] -v_cmpx_eq_i16 null, v2 -// GFX12: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] +v_cmpx_eq_i16 null, v2.l +// GFX12: v_cmpx_eq_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x64,0x7d] -v_cmpx_eq_i16 -1, v2 -// GFX12: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] +v_cmpx_eq_i16 -1, v2.l +// GFX12: v_cmpx_eq_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x64,0x7d] -v_cmpx_eq_i16 0.5, v2 -// GFX12: v_cmpx_eq_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x64,0x7d] +v_cmpx_eq_i16 0.5, v2.l +// GFX12: v_cmpx_eq_i16_e32 0.5, v2.l ; encoding: 
[0xf0,0x04,0x64,0x7d] -v_cmpx_eq_i16 src_scc, v2 -// GFX12: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] +v_cmpx_eq_i16 src_scc, v2.l +// GFX12: v_cmpx_eq_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x64,0x7d] -v_cmpx_eq_i16 0xfe0b, v127 -// GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_i16 0xfe0b, v127.l +// GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_eq_i16 v1.h, v2.l +// GFX12: v_cmpx_eq_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x64,0x7d] + +v_cmpx_eq_i16 v127.h, v2.l +// GFX12: v_cmpx_eq_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x64,0x7d] + +v_cmpx_eq_i16 src_scc, v2.h +// GFX12: v_cmpx_eq_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x65,0x7d] + +v_cmpx_eq_i16 0xfe0b, v127.h +// GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_i32 v1, v2 // GFX12: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] @@ -392,50 +404,62 @@ v_cmpx_eq_i64 src_scc, v[2:3] v_cmpx_eq_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_u16 v1, v2 -// GFX12: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] +v_cmpx_eq_u16 v1.l, v2.l +// GFX12: v_cmpx_eq_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x74,0x7d] + +v_cmpx_eq_u16 v127.l, v2.l +// GFX12: v_cmpx_eq_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x74,0x7d] + +v_cmpx_eq_u16 s1, v2.l +// GFX12: v_cmpx_eq_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x74,0x7d] -v_cmpx_eq_u16 v127, v2 -// GFX12: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] +v_cmpx_eq_u16 s105, v2.l +// GFX12: v_cmpx_eq_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x74,0x7d] -v_cmpx_eq_u16 s1, v2 -// GFX12: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] +v_cmpx_eq_u16 vcc_lo, v2.l +// GFX12: v_cmpx_eq_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x74,0x7d] 
-v_cmpx_eq_u16 s105, v2 -// GFX12: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] +v_cmpx_eq_u16 vcc_hi, v2.l +// GFX12: v_cmpx_eq_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x74,0x7d] -v_cmpx_eq_u16 vcc_lo, v2 -// GFX12: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] +v_cmpx_eq_u16 ttmp15, v2.l +// GFX12: v_cmpx_eq_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x74,0x7d] -v_cmpx_eq_u16 vcc_hi, v2 -// GFX12: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] +v_cmpx_eq_u16 m0, v2.l +// GFX12: v_cmpx_eq_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x74,0x7d] -v_cmpx_eq_u16 ttmp15, v2 -// GFX12: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] +v_cmpx_eq_u16 exec_lo, v2.l +// GFX12: v_cmpx_eq_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x74,0x7d] -v_cmpx_eq_u16 m0, v2 -// GFX12: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] +v_cmpx_eq_u16 exec_hi, v2.l +// GFX12: v_cmpx_eq_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x74,0x7d] -v_cmpx_eq_u16 exec_lo, v2 -// GFX12: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] +v_cmpx_eq_u16 null, v2.l +// GFX12: v_cmpx_eq_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x74,0x7d] -v_cmpx_eq_u16 exec_hi, v2 -// GFX12: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] +v_cmpx_eq_u16 -1, v2.l +// GFX12: v_cmpx_eq_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x74,0x7d] -v_cmpx_eq_u16 null, v2 -// GFX12: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] +v_cmpx_eq_u16 0.5, v2.l +// GFX12: v_cmpx_eq_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x74,0x7d] -v_cmpx_eq_u16 -1, v2 -// GFX12: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] +v_cmpx_eq_u16 src_scc, v2.l +// GFX12: v_cmpx_eq_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x74,0x7d] -v_cmpx_eq_u16 0.5, v2 -// GFX12: v_cmpx_eq_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x74,0x7d] +v_cmpx_eq_u16 0xfe0b, v127.l +// GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127.l ; encoding: 
[0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_eq_u16 src_scc, v2 -// GFX12: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] +v_cmpx_eq_u16 v1.h, v2.l +// GFX12: v_cmpx_eq_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x74,0x7d] -v_cmpx_eq_u16 0xfe0b, v127 -// GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_u16 v127.h, v2.l +// GFX12: v_cmpx_eq_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x74,0x7d] + +v_cmpx_eq_u16 src_scc, v2.h +// GFX12: v_cmpx_eq_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x75,0x7d] + +v_cmpx_eq_u16 0xfe0b, v127.h +// GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_u32 v1, v2 // GFX12: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d] @@ -644,50 +668,62 @@ v_cmpx_ge_f64 src_scc, v[2:3] v_cmpx_ge_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_i16 v1, v2 -// GFX12: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] +v_cmpx_ge_i16 v1.l, v2.l +// GFX12: v_cmpx_ge_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6c,0x7d] + +v_cmpx_ge_i16 v127.l, v2.l +// GFX12: v_cmpx_ge_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6c,0x7d] + +v_cmpx_ge_i16 s1, v2.l +// GFX12: v_cmpx_ge_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6c,0x7d] + +v_cmpx_ge_i16 s105, v2.l +// GFX12: v_cmpx_ge_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6c,0x7d] + +v_cmpx_ge_i16 vcc_lo, v2.l +// GFX12: v_cmpx_ge_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6c,0x7d] -v_cmpx_ge_i16 v127, v2 -// GFX12: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] +v_cmpx_ge_i16 vcc_hi, v2.l +// GFX12: v_cmpx_ge_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6c,0x7d] -v_cmpx_ge_i16 s1, v2 -// GFX12: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] +v_cmpx_ge_i16 ttmp15, v2.l +// GFX12: v_cmpx_ge_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6c,0x7d] 
-v_cmpx_ge_i16 s105, v2 -// GFX12: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] +v_cmpx_ge_i16 m0, v2.l +// GFX12: v_cmpx_ge_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6c,0x7d] -v_cmpx_ge_i16 vcc_lo, v2 -// GFX12: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] +v_cmpx_ge_i16 exec_lo, v2.l +// GFX12: v_cmpx_ge_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6c,0x7d] -v_cmpx_ge_i16 vcc_hi, v2 -// GFX12: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] +v_cmpx_ge_i16 exec_hi, v2.l +// GFX12: v_cmpx_ge_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6c,0x7d] -v_cmpx_ge_i16 ttmp15, v2 -// GFX12: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] +v_cmpx_ge_i16 null, v2.l +// GFX12: v_cmpx_ge_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6c,0x7d] -v_cmpx_ge_i16 m0, v2 -// GFX12: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] +v_cmpx_ge_i16 -1, v2.l +// GFX12: v_cmpx_ge_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6c,0x7d] -v_cmpx_ge_i16 exec_lo, v2 -// GFX12: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] +v_cmpx_ge_i16 0.5, v2.l +// GFX12: v_cmpx_ge_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x6c,0x7d] -v_cmpx_ge_i16 exec_hi, v2 -// GFX12: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] +v_cmpx_ge_i16 src_scc, v2.l +// GFX12: v_cmpx_ge_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6c,0x7d] -v_cmpx_ge_i16 null, v2 -// GFX12: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] +v_cmpx_ge_i16 0xfe0b, v127.l +// GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ge_i16 -1, v2 -// GFX12: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] +v_cmpx_ge_i16 v1.h, v2.l +// GFX12: v_cmpx_ge_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6c,0x7d] -v_cmpx_ge_i16 0.5, v2 -// GFX12: v_cmpx_ge_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x6c,0x7d] +v_cmpx_ge_i16 v127.h, v2.l +// GFX12: v_cmpx_ge_i16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x6c,0x7d] -v_cmpx_ge_i16 src_scc, v2 -// GFX12: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] +v_cmpx_ge_i16 src_scc, v2.h +// GFX12: v_cmpx_ge_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6d,0x7d] -v_cmpx_ge_i16 0xfe0b, v127 -// GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_i16 0xfe0b, v127.h +// GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_i32 v1, v2 // GFX12: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] @@ -770,50 +806,62 @@ v_cmpx_ge_i64 src_scc, v[2:3] v_cmpx_ge_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_u16 v1, v2 -// GFX12: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] +v_cmpx_ge_u16 v1.l, v2.l +// GFX12: v_cmpx_ge_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7c,0x7d] -v_cmpx_ge_u16 v127, v2 -// GFX12: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] +v_cmpx_ge_u16 v127.l, v2.l +// GFX12: v_cmpx_ge_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7c,0x7d] -v_cmpx_ge_u16 s1, v2 -// GFX12: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] +v_cmpx_ge_u16 s1, v2.l +// GFX12: v_cmpx_ge_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7c,0x7d] -v_cmpx_ge_u16 s105, v2 -// GFX12: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] +v_cmpx_ge_u16 s105, v2.l +// GFX12: v_cmpx_ge_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7c,0x7d] -v_cmpx_ge_u16 vcc_lo, v2 -// GFX12: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] +v_cmpx_ge_u16 vcc_lo, v2.l +// GFX12: v_cmpx_ge_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7c,0x7d] -v_cmpx_ge_u16 vcc_hi, v2 -// GFX12: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] +v_cmpx_ge_u16 vcc_hi, v2.l +// GFX12: v_cmpx_ge_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7c,0x7d] -v_cmpx_ge_u16 ttmp15, v2 -// GFX12: 
v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] +v_cmpx_ge_u16 ttmp15, v2.l +// GFX12: v_cmpx_ge_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7c,0x7d] -v_cmpx_ge_u16 m0, v2 -// GFX12: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] +v_cmpx_ge_u16 m0, v2.l +// GFX12: v_cmpx_ge_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7c,0x7d] -v_cmpx_ge_u16 exec_lo, v2 -// GFX12: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] +v_cmpx_ge_u16 exec_lo, v2.l +// GFX12: v_cmpx_ge_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7c,0x7d] -v_cmpx_ge_u16 exec_hi, v2 -// GFX12: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] +v_cmpx_ge_u16 exec_hi, v2.l +// GFX12: v_cmpx_ge_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7c,0x7d] -v_cmpx_ge_u16 null, v2 -// GFX12: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] +v_cmpx_ge_u16 null, v2.l +// GFX12: v_cmpx_ge_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7c,0x7d] -v_cmpx_ge_u16 -1, v2 -// GFX12: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] +v_cmpx_ge_u16 -1, v2.l +// GFX12: v_cmpx_ge_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7c,0x7d] -v_cmpx_ge_u16 0.5, v2 -// GFX12: v_cmpx_ge_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x7c,0x7d] +v_cmpx_ge_u16 0.5, v2.l +// GFX12: v_cmpx_ge_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x7c,0x7d] -v_cmpx_ge_u16 src_scc, v2 -// GFX12: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] +v_cmpx_ge_u16 src_scc, v2.l +// GFX12: v_cmpx_ge_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7c,0x7d] -v_cmpx_ge_u16 0xfe0b, v127 -// GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_u16 0xfe0b, v127.l +// GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ge_u16 v1.h, v2.l +// GFX12: v_cmpx_ge_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7c,0x7d] + +v_cmpx_ge_u16 v127.h, v2.l +// GFX12: v_cmpx_ge_u16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x7c,0x7d] + +v_cmpx_ge_u16 src_scc, v2.h +// GFX12: v_cmpx_ge_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7d,0x7d] + +v_cmpx_ge_u16 0xfe0b, v127.h +// GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_u32 v1, v2 // GFX12: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] @@ -1022,50 +1070,62 @@ v_cmpx_gt_f64 src_scc, v[2:3] v_cmpx_gt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_i16 v1, v2 -// GFX12: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] +v_cmpx_gt_i16 v1.l, v2.l +// GFX12: v_cmpx_gt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x68,0x7d] + +v_cmpx_gt_i16 v127.l, v2.l +// GFX12: v_cmpx_gt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x68,0x7d] + +v_cmpx_gt_i16 s1, v2.l +// GFX12: v_cmpx_gt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x68,0x7d] -v_cmpx_gt_i16 v127, v2 -// GFX12: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] +v_cmpx_gt_i16 s105, v2.l +// GFX12: v_cmpx_gt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x68,0x7d] -v_cmpx_gt_i16 s1, v2 -// GFX12: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] +v_cmpx_gt_i16 vcc_lo, v2.l +// GFX12: v_cmpx_gt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x68,0x7d] -v_cmpx_gt_i16 s105, v2 -// GFX12: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] +v_cmpx_gt_i16 vcc_hi, v2.l +// GFX12: v_cmpx_gt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x68,0x7d] -v_cmpx_gt_i16 vcc_lo, v2 -// GFX12: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] +v_cmpx_gt_i16 ttmp15, v2.l +// GFX12: v_cmpx_gt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x68,0x7d] -v_cmpx_gt_i16 vcc_hi, v2 -// GFX12: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] +v_cmpx_gt_i16 m0, v2.l +// GFX12: v_cmpx_gt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x68,0x7d] -v_cmpx_gt_i16 ttmp15, v2 -// GFX12: v_cmpx_gt_i16_e32 ttmp15, v2 
; encoding: [0x7b,0x04,0x68,0x7d] +v_cmpx_gt_i16 exec_lo, v2.l +// GFX12: v_cmpx_gt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x68,0x7d] -v_cmpx_gt_i16 m0, v2 -// GFX12: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] +v_cmpx_gt_i16 exec_hi, v2.l +// GFX12: v_cmpx_gt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x68,0x7d] -v_cmpx_gt_i16 exec_lo, v2 -// GFX12: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] +v_cmpx_gt_i16 null, v2.l +// GFX12: v_cmpx_gt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x68,0x7d] -v_cmpx_gt_i16 exec_hi, v2 -// GFX12: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] +v_cmpx_gt_i16 -1, v2.l +// GFX12: v_cmpx_gt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x68,0x7d] -v_cmpx_gt_i16 null, v2 -// GFX12: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] +v_cmpx_gt_i16 0.5, v2.l +// GFX12: v_cmpx_gt_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x68,0x7d] -v_cmpx_gt_i16 -1, v2 -// GFX12: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] +v_cmpx_gt_i16 src_scc, v2.l +// GFX12: v_cmpx_gt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x68,0x7d] -v_cmpx_gt_i16 0.5, v2 -// GFX12: v_cmpx_gt_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x68,0x7d] +v_cmpx_gt_i16 0xfe0b, v127.l +// GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_i16 src_scc, v2 -// GFX12: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] +v_cmpx_gt_i16 v1.h, v2.l +// GFX12: v_cmpx_gt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x68,0x7d] -v_cmpx_gt_i16 0xfe0b, v127 -// GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_i16 v127.h, v2.l +// GFX12: v_cmpx_gt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x68,0x7d] + +v_cmpx_gt_i16 src_scc, v2.h +// GFX12: v_cmpx_gt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x69,0x7d] + +v_cmpx_gt_i16 0xfe0b, v127.h +// GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127.h ; encoding: 
[0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_i32 v1, v2 // GFX12: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] @@ -1148,50 +1208,62 @@ v_cmpx_gt_i64 src_scc, v[2:3] v_cmpx_gt_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_u16 v1, v2 -// GFX12: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] +v_cmpx_gt_u16 v1.l, v2.l +// GFX12: v_cmpx_gt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x78,0x7d] + +v_cmpx_gt_u16 v127.l, v2.l +// GFX12: v_cmpx_gt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x78,0x7d] + +v_cmpx_gt_u16 s1, v2.l +// GFX12: v_cmpx_gt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x78,0x7d] + +v_cmpx_gt_u16 s105, v2.l +// GFX12: v_cmpx_gt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x78,0x7d] + +v_cmpx_gt_u16 vcc_lo, v2.l +// GFX12: v_cmpx_gt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x78,0x7d] -v_cmpx_gt_u16 v127, v2 -// GFX12: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] +v_cmpx_gt_u16 vcc_hi, v2.l +// GFX12: v_cmpx_gt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x78,0x7d] -v_cmpx_gt_u16 s1, v2 -// GFX12: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] +v_cmpx_gt_u16 ttmp15, v2.l +// GFX12: v_cmpx_gt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x78,0x7d] -v_cmpx_gt_u16 s105, v2 -// GFX12: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] +v_cmpx_gt_u16 m0, v2.l +// GFX12: v_cmpx_gt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x78,0x7d] -v_cmpx_gt_u16 vcc_lo, v2 -// GFX12: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] +v_cmpx_gt_u16 exec_lo, v2.l +// GFX12: v_cmpx_gt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x78,0x7d] -v_cmpx_gt_u16 vcc_hi, v2 -// GFX12: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] +v_cmpx_gt_u16 exec_hi, v2.l +// GFX12: v_cmpx_gt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x78,0x7d] -v_cmpx_gt_u16 ttmp15, v2 -// GFX12: v_cmpx_gt_u16_e32 ttmp15, v2 ; 
encoding: [0x7b,0x04,0x78,0x7d] +v_cmpx_gt_u16 null, v2.l +// GFX12: v_cmpx_gt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x78,0x7d] -v_cmpx_gt_u16 m0, v2 -// GFX12: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] +v_cmpx_gt_u16 -1, v2.l +// GFX12: v_cmpx_gt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x78,0x7d] -v_cmpx_gt_u16 exec_lo, v2 -// GFX12: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] +v_cmpx_gt_u16 0.5, v2.l +// GFX12: v_cmpx_gt_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x78,0x7d] -v_cmpx_gt_u16 exec_hi, v2 -// GFX12: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] +v_cmpx_gt_u16 src_scc, v2.l +// GFX12: v_cmpx_gt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x78,0x7d] -v_cmpx_gt_u16 null, v2 -// GFX12: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] +v_cmpx_gt_u16 0xfe0b, v127.l +// GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_u16 -1, v2 -// GFX12: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] +v_cmpx_gt_u16 v1.h, v2.l +// GFX12: v_cmpx_gt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x78,0x7d] -v_cmpx_gt_u16 0.5, v2 -// GFX12: v_cmpx_gt_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x78,0x7d] +v_cmpx_gt_u16 v127.h, v2.l +// GFX12: v_cmpx_gt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x78,0x7d] -v_cmpx_gt_u16 src_scc, v2 -// GFX12: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] +v_cmpx_gt_u16 src_scc, v2.h +// GFX12: v_cmpx_gt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x79,0x7d] -v_cmpx_gt_u16 0xfe0b, v127 -// GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_u16 0xfe0b, v127.h +// GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_u32 v1, v2 // GFX12: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] @@ -1400,50 +1472,62 @@ v_cmpx_le_f64 src_scc, v[2:3] v_cmpx_le_f64 0xaf123456, v[254:255] // GFX12: 
v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_i16 v1, v2 -// GFX12: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] +v_cmpx_le_i16 v1.l, v2.l +// GFX12: v_cmpx_le_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x66,0x7d] -v_cmpx_le_i16 v127, v2 -// GFX12: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] +v_cmpx_le_i16 v127.l, v2.l +// GFX12: v_cmpx_le_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x66,0x7d] -v_cmpx_le_i16 s1, v2 -// GFX12: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] +v_cmpx_le_i16 s1, v2.l +// GFX12: v_cmpx_le_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x66,0x7d] -v_cmpx_le_i16 s105, v2 -// GFX12: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] +v_cmpx_le_i16 s105, v2.l +// GFX12: v_cmpx_le_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x66,0x7d] -v_cmpx_le_i16 vcc_lo, v2 -// GFX12: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] +v_cmpx_le_i16 vcc_lo, v2.l +// GFX12: v_cmpx_le_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x66,0x7d] -v_cmpx_le_i16 vcc_hi, v2 -// GFX12: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] +v_cmpx_le_i16 vcc_hi, v2.l +// GFX12: v_cmpx_le_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x66,0x7d] -v_cmpx_le_i16 ttmp15, v2 -// GFX12: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] +v_cmpx_le_i16 ttmp15, v2.l +// GFX12: v_cmpx_le_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x66,0x7d] -v_cmpx_le_i16 m0, v2 -// GFX12: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] +v_cmpx_le_i16 m0, v2.l +// GFX12: v_cmpx_le_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x66,0x7d] -v_cmpx_le_i16 exec_lo, v2 -// GFX12: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] +v_cmpx_le_i16 exec_lo, v2.l +// GFX12: v_cmpx_le_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x66,0x7d] -v_cmpx_le_i16 exec_hi, v2 -// GFX12: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] +v_cmpx_le_i16 
exec_hi, v2.l +// GFX12: v_cmpx_le_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x66,0x7d] -v_cmpx_le_i16 null, v2 -// GFX12: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] +v_cmpx_le_i16 null, v2.l +// GFX12: v_cmpx_le_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x66,0x7d] -v_cmpx_le_i16 -1, v2 -// GFX12: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] +v_cmpx_le_i16 -1, v2.l +// GFX12: v_cmpx_le_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x66,0x7d] -v_cmpx_le_i16 0.5, v2 -// GFX12: v_cmpx_le_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x66,0x7d] +v_cmpx_le_i16 0.5, v2.l +// GFX12: v_cmpx_le_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x66,0x7d] -v_cmpx_le_i16 src_scc, v2 -// GFX12: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] +v_cmpx_le_i16 src_scc, v2.l +// GFX12: v_cmpx_le_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x66,0x7d] -v_cmpx_le_i16 0xfe0b, v127 -// GFX12: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_i16 0xfe0b, v127.l +// GFX12: v_cmpx_le_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_le_i16 v1.h, v2.l +// GFX12: v_cmpx_le_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x66,0x7d] + +v_cmpx_le_i16 v127.h, v2.l +// GFX12: v_cmpx_le_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x66,0x7d] + +v_cmpx_le_i16 src_scc, v2.h +// GFX12: v_cmpx_le_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x67,0x7d] + +v_cmpx_le_i16 0xfe0b, v127.h +// GFX12: v_cmpx_le_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_i32 v1, v2 // GFX12: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] @@ -1526,50 +1610,62 @@ v_cmpx_le_i64 src_scc, v[2:3] v_cmpx_le_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_u16 v1, v2 -// GFX12: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] +v_cmpx_le_u16 v1.l, v2.l +// GFX12: 
v_cmpx_le_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x76,0x7d] + +v_cmpx_le_u16 v127.l, v2.l +// GFX12: v_cmpx_le_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x76,0x7d] + +v_cmpx_le_u16 s1, v2.l +// GFX12: v_cmpx_le_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x76,0x7d] -v_cmpx_le_u16 v127, v2 -// GFX12: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] +v_cmpx_le_u16 s105, v2.l +// GFX12: v_cmpx_le_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x76,0x7d] -v_cmpx_le_u16 s1, v2 -// GFX12: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] +v_cmpx_le_u16 vcc_lo, v2.l +// GFX12: v_cmpx_le_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x76,0x7d] -v_cmpx_le_u16 s105, v2 -// GFX12: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] +v_cmpx_le_u16 vcc_hi, v2.l +// GFX12: v_cmpx_le_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x76,0x7d] -v_cmpx_le_u16 vcc_lo, v2 -// GFX12: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] +v_cmpx_le_u16 ttmp15, v2.l +// GFX12: v_cmpx_le_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x76,0x7d] -v_cmpx_le_u16 vcc_hi, v2 -// GFX12: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] +v_cmpx_le_u16 m0, v2.l +// GFX12: v_cmpx_le_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x76,0x7d] -v_cmpx_le_u16 ttmp15, v2 -// GFX12: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] +v_cmpx_le_u16 exec_lo, v2.l +// GFX12: v_cmpx_le_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x76,0x7d] -v_cmpx_le_u16 m0, v2 -// GFX12: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] +v_cmpx_le_u16 exec_hi, v2.l +// GFX12: v_cmpx_le_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x76,0x7d] -v_cmpx_le_u16 exec_lo, v2 -// GFX12: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] +v_cmpx_le_u16 null, v2.l +// GFX12: v_cmpx_le_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x76,0x7d] -v_cmpx_le_u16 exec_hi, v2 -// GFX12: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] +v_cmpx_le_u16 -1, v2.l +// 
GFX12: v_cmpx_le_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x76,0x7d] -v_cmpx_le_u16 null, v2 -// GFX12: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] +v_cmpx_le_u16 0.5, v2.l +// GFX12: v_cmpx_le_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x76,0x7d] -v_cmpx_le_u16 -1, v2 -// GFX12: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] +v_cmpx_le_u16 src_scc, v2.l +// GFX12: v_cmpx_le_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x76,0x7d] -v_cmpx_le_u16 0.5, v2 -// GFX12: v_cmpx_le_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x76,0x7d] +v_cmpx_le_u16 0xfe0b, v127.l +// GFX12: v_cmpx_le_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_u16 src_scc, v2 -// GFX12: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] +v_cmpx_le_u16 v1.h, v2.l +// GFX12: v_cmpx_le_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x76,0x7d] -v_cmpx_le_u16 0xfe0b, v127 -// GFX12: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_u16 v127.h, v2.l +// GFX12: v_cmpx_le_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x76,0x7d] + +v_cmpx_le_u16 src_scc, v2.h +// GFX12: v_cmpx_le_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x77,0x7d] + +v_cmpx_le_u16 0xfe0b, v127.h +// GFX12: v_cmpx_le_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_u32 v1, v2 // GFX12: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] @@ -1916,50 +2012,62 @@ v_cmpx_lt_f64 src_scc, v[2:3] v_cmpx_lt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_i16 v1, v2 -// GFX12: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] +v_cmpx_lt_i16 v1.l, v2.l +// GFX12: v_cmpx_lt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x62,0x7d] + +v_cmpx_lt_i16 v127.l, v2.l +// GFX12: v_cmpx_lt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x62,0x7d] + +v_cmpx_lt_i16 s1, v2.l +// GFX12: v_cmpx_lt_i16_e32 s1, v2.l ; 
encoding: [0x01,0x04,0x62,0x7d] + +v_cmpx_lt_i16 s105, v2.l +// GFX12: v_cmpx_lt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x62,0x7d] + +v_cmpx_lt_i16 vcc_lo, v2.l +// GFX12: v_cmpx_lt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x62,0x7d] -v_cmpx_lt_i16 v127, v2 -// GFX12: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] +v_cmpx_lt_i16 vcc_hi, v2.l +// GFX12: v_cmpx_lt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x62,0x7d] -v_cmpx_lt_i16 s1, v2 -// GFX12: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] +v_cmpx_lt_i16 ttmp15, v2.l +// GFX12: v_cmpx_lt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x62,0x7d] -v_cmpx_lt_i16 s105, v2 -// GFX12: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] +v_cmpx_lt_i16 m0, v2.l +// GFX12: v_cmpx_lt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x62,0x7d] -v_cmpx_lt_i16 vcc_lo, v2 -// GFX12: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] +v_cmpx_lt_i16 exec_lo, v2.l +// GFX12: v_cmpx_lt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x62,0x7d] -v_cmpx_lt_i16 vcc_hi, v2 -// GFX12: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] +v_cmpx_lt_i16 exec_hi, v2.l +// GFX12: v_cmpx_lt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x62,0x7d] -v_cmpx_lt_i16 ttmp15, v2 -// GFX12: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] +v_cmpx_lt_i16 null, v2.l +// GFX12: v_cmpx_lt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x62,0x7d] -v_cmpx_lt_i16 m0, v2 -// GFX12: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] +v_cmpx_lt_i16 -1, v2.l +// GFX12: v_cmpx_lt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x62,0x7d] -v_cmpx_lt_i16 exec_lo, v2 -// GFX12: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] +v_cmpx_lt_i16 0.5, v2.l +// GFX12: v_cmpx_lt_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x62,0x7d] -v_cmpx_lt_i16 exec_hi, v2 -// GFX12: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] +v_cmpx_lt_i16 src_scc, v2.l +// GFX12: v_cmpx_lt_i16_e32 src_scc, 
v2.l ; encoding: [0xfd,0x04,0x62,0x7d] -v_cmpx_lt_i16 null, v2 -// GFX12: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] +v_cmpx_lt_i16 0xfe0b, v127.l +// GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_lt_i16 -1, v2 -// GFX12: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] +v_cmpx_lt_i16 v1.h, v2.l +// GFX12: v_cmpx_lt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x62,0x7d] -v_cmpx_lt_i16 0.5, v2 -// GFX12: v_cmpx_lt_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x62,0x7d] +v_cmpx_lt_i16 v127.h, v2.l +// GFX12: v_cmpx_lt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x62,0x7d] -v_cmpx_lt_i16 src_scc, v2 -// GFX12: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] +v_cmpx_lt_i16 src_scc, v2.h +// GFX12: v_cmpx_lt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x63,0x7d] -v_cmpx_lt_i16 0xfe0b, v127 -// GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_i16 0xfe0b, v127.h +// GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_i32 v1, v2 // GFX12: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] @@ -2042,50 +2150,62 @@ v_cmpx_lt_i64 src_scc, v[2:3] v_cmpx_lt_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_u16 v1, v2 -// GFX12: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] +v_cmpx_lt_u16 v1.l, v2.l +// GFX12: v_cmpx_lt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x72,0x7d] -v_cmpx_lt_u16 v127, v2 -// GFX12: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] +v_cmpx_lt_u16 v127.l, v2.l +// GFX12: v_cmpx_lt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x72,0x7d] -v_cmpx_lt_u16 s1, v2 -// GFX12: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] +v_cmpx_lt_u16 s1, v2.l +// GFX12: v_cmpx_lt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x72,0x7d] -v_cmpx_lt_u16 s105, 
v2 -// GFX12: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] +v_cmpx_lt_u16 s105, v2.l +// GFX12: v_cmpx_lt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x72,0x7d] -v_cmpx_lt_u16 vcc_lo, v2 -// GFX12: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] +v_cmpx_lt_u16 vcc_lo, v2.l +// GFX12: v_cmpx_lt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x72,0x7d] -v_cmpx_lt_u16 vcc_hi, v2 -// GFX12: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] +v_cmpx_lt_u16 vcc_hi, v2.l +// GFX12: v_cmpx_lt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x72,0x7d] -v_cmpx_lt_u16 ttmp15, v2 -// GFX12: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] +v_cmpx_lt_u16 ttmp15, v2.l +// GFX12: v_cmpx_lt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x72,0x7d] -v_cmpx_lt_u16 m0, v2 -// GFX12: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] +v_cmpx_lt_u16 m0, v2.l +// GFX12: v_cmpx_lt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x72,0x7d] -v_cmpx_lt_u16 exec_lo, v2 -// GFX12: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] +v_cmpx_lt_u16 exec_lo, v2.l +// GFX12: v_cmpx_lt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x72,0x7d] -v_cmpx_lt_u16 exec_hi, v2 -// GFX12: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] +v_cmpx_lt_u16 exec_hi, v2.l +// GFX12: v_cmpx_lt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x72,0x7d] -v_cmpx_lt_u16 null, v2 -// GFX12: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] +v_cmpx_lt_u16 null, v2.l +// GFX12: v_cmpx_lt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x72,0x7d] -v_cmpx_lt_u16 -1, v2 -// GFX12: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] +v_cmpx_lt_u16 -1, v2.l +// GFX12: v_cmpx_lt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x72,0x7d] -v_cmpx_lt_u16 0.5, v2 -// GFX12: v_cmpx_lt_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x72,0x7d] +v_cmpx_lt_u16 0.5, v2.l +// GFX12: v_cmpx_lt_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x72,0x7d] -v_cmpx_lt_u16 src_scc, v2 -// 
GFX12: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] +v_cmpx_lt_u16 src_scc, v2.l +// GFX12: v_cmpx_lt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x72,0x7d] -v_cmpx_lt_u16 0xfe0b, v127 -// GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_u16 0xfe0b, v127.l +// GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lt_u16 v1.h, v2.l +// GFX12: v_cmpx_lt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x72,0x7d] + +v_cmpx_lt_u16 v127.h, v2.l +// GFX12: v_cmpx_lt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x72,0x7d] + +v_cmpx_lt_u16 src_scc, v2.h +// GFX12: v_cmpx_lt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x73,0x7d] + +v_cmpx_lt_u16 0xfe0b, v127.h +// GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_u32 v1, v2 // GFX12: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] @@ -2168,50 +2288,62 @@ v_cmpx_lt_u64 src_scc, v[2:3] v_cmpx_lt_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ne_i16 v1, v2 -// GFX12: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] +v_cmpx_ne_i16 v1.l, v2.l +// GFX12: v_cmpx_ne_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6a,0x7d] + +v_cmpx_ne_i16 v127.l, v2.l +// GFX12: v_cmpx_ne_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6a,0x7d] + +v_cmpx_ne_i16 s1, v2.l +// GFX12: v_cmpx_ne_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6a,0x7d] -v_cmpx_ne_i16 v127, v2 -// GFX12: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] +v_cmpx_ne_i16 s105, v2.l +// GFX12: v_cmpx_ne_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6a,0x7d] -v_cmpx_ne_i16 s1, v2 -// GFX12: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] +v_cmpx_ne_i16 vcc_lo, v2.l +// GFX12: v_cmpx_ne_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6a,0x7d] -v_cmpx_ne_i16 s105, v2 -// GFX12: 
v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] +v_cmpx_ne_i16 vcc_hi, v2.l +// GFX12: v_cmpx_ne_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6a,0x7d] -v_cmpx_ne_i16 vcc_lo, v2 -// GFX12: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] +v_cmpx_ne_i16 ttmp15, v2.l +// GFX12: v_cmpx_ne_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6a,0x7d] -v_cmpx_ne_i16 vcc_hi, v2 -// GFX12: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] +v_cmpx_ne_i16 m0, v2.l +// GFX12: v_cmpx_ne_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6a,0x7d] -v_cmpx_ne_i16 ttmp15, v2 -// GFX12: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] +v_cmpx_ne_i16 exec_lo, v2.l +// GFX12: v_cmpx_ne_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6a,0x7d] -v_cmpx_ne_i16 m0, v2 -// GFX12: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] +v_cmpx_ne_i16 exec_hi, v2.l +// GFX12: v_cmpx_ne_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6a,0x7d] -v_cmpx_ne_i16 exec_lo, v2 -// GFX12: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] +v_cmpx_ne_i16 null, v2.l +// GFX12: v_cmpx_ne_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6a,0x7d] -v_cmpx_ne_i16 exec_hi, v2 -// GFX12: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] +v_cmpx_ne_i16 -1, v2.l +// GFX12: v_cmpx_ne_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6a,0x7d] -v_cmpx_ne_i16 null, v2 -// GFX12: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] +v_cmpx_ne_i16 0.5, v2.l +// GFX12: v_cmpx_ne_i16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x6a,0x7d] -v_cmpx_ne_i16 -1, v2 -// GFX12: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] +v_cmpx_ne_i16 src_scc, v2.l +// GFX12: v_cmpx_ne_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6a,0x7d] -v_cmpx_ne_i16 0.5, v2 -// GFX12: v_cmpx_ne_i16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x6a,0x7d] +v_cmpx_ne_i16 0xfe0b, v127.l +// GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ne_i16 
src_scc, v2 -// GFX12: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] +v_cmpx_ne_i16 v1.h, v2.l +// GFX12: v_cmpx_ne_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6a,0x7d] -v_cmpx_ne_i16 0xfe0b, v127 -// GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_i16 v127.h, v2.l +// GFX12: v_cmpx_ne_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6a,0x7d] + +v_cmpx_ne_i16 src_scc, v2.h +// GFX12: v_cmpx_ne_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6b,0x7d] + +v_cmpx_ne_i16 0xfe0b, v127.h +// GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ne_i32 v1, v2 // GFX12: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] @@ -2294,50 +2426,62 @@ v_cmpx_ne_i64 src_scc, v[2:3] v_cmpx_ne_i64 0xaf123456, v[254:255] // GFX12: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ne_u16 v1, v2 -// GFX12: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] +v_cmpx_ne_u16 v1.l, v2.l +// GFX12: v_cmpx_ne_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7a,0x7d] + +v_cmpx_ne_u16 v127.l, v2.l +// GFX12: v_cmpx_ne_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7a,0x7d] + +v_cmpx_ne_u16 s1, v2.l +// GFX12: v_cmpx_ne_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7a,0x7d] + +v_cmpx_ne_u16 s105, v2.l +// GFX12: v_cmpx_ne_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7a,0x7d] + +v_cmpx_ne_u16 vcc_lo, v2.l +// GFX12: v_cmpx_ne_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7a,0x7d] -v_cmpx_ne_u16 v127, v2 -// GFX12: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] +v_cmpx_ne_u16 vcc_hi, v2.l +// GFX12: v_cmpx_ne_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7a,0x7d] -v_cmpx_ne_u16 s1, v2 -// GFX12: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] +v_cmpx_ne_u16 ttmp15, v2.l +// GFX12: v_cmpx_ne_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7a,0x7d] -v_cmpx_ne_u16 s105, v2 -// GFX12: v_cmpx_ne_u16_e32 s105, 
v2 ; encoding: [0x69,0x04,0x7a,0x7d] +v_cmpx_ne_u16 m0, v2.l +// GFX12: v_cmpx_ne_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7a,0x7d] -v_cmpx_ne_u16 vcc_lo, v2 -// GFX12: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] +v_cmpx_ne_u16 exec_lo, v2.l +// GFX12: v_cmpx_ne_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7a,0x7d] -v_cmpx_ne_u16 vcc_hi, v2 -// GFX12: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] +v_cmpx_ne_u16 exec_hi, v2.l +// GFX12: v_cmpx_ne_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7a,0x7d] -v_cmpx_ne_u16 ttmp15, v2 -// GFX12: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] +v_cmpx_ne_u16 null, v2.l +// GFX12: v_cmpx_ne_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7a,0x7d] -v_cmpx_ne_u16 m0, v2 -// GFX12: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] +v_cmpx_ne_u16 -1, v2.l +// GFX12: v_cmpx_ne_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7a,0x7d] -v_cmpx_ne_u16 exec_lo, v2 -// GFX12: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] +v_cmpx_ne_u16 0.5, v2.l +// GFX12: v_cmpx_ne_u16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x7a,0x7d] -v_cmpx_ne_u16 exec_hi, v2 -// GFX12: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] +v_cmpx_ne_u16 src_scc, v2.l +// GFX12: v_cmpx_ne_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7a,0x7d] -v_cmpx_ne_u16 null, v2 -// GFX12: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] +v_cmpx_ne_u16 0xfe0b, v127.l +// GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ne_u16 -1, v2 -// GFX12: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] +v_cmpx_ne_u16 v1.h, v2.l +// GFX12: v_cmpx_ne_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7a,0x7d] -v_cmpx_ne_u16 0.5, v2 -// GFX12: v_cmpx_ne_u16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x7a,0x7d] +v_cmpx_ne_u16 v127.h, v2.l +// GFX12: v_cmpx_ne_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7a,0x7d] -v_cmpx_ne_u16 src_scc, v2 -// GFX12: 
v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] +v_cmpx_ne_u16 src_scc, v2.h +// GFX12: v_cmpx_ne_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7b,0x7d] -v_cmpx_ne_u16 0xfe0b, v127 -// GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ne_u16 0xfe0b, v127.h +// GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ne_u32 v1, v2 // GFX12: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index 97f56535364c75..5c54d1ad5788c3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -176,47 +176,53 @@ v_cmpx_eq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_eq_i16 v1, v2 row_mirror -// GFX12: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_mirror 
+// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] @@ -260,47 +266,53 @@ v_cmpx_eq_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_mirror -// GFX12: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_u16 
v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] @@ -428,47 +440,53 @@ v_cmpx_ge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_mirror 
-// GFX12: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6d,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] @@ -512,47 +530,53 @@ v_cmpx_ge_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ge_u16 v1, v2 row_mirror -// GFX12: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] 
+v_cmpx_ge_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_u16 v1.l, 
v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ge_u16 v127.h, v127.h 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff] @@ -680,47 +704,53 @@ v_cmpx_gt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_mirror -// GFX12: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_i16 v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// 
GFX12: v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff] @@ -764,47 +794,53 @@ v_cmpx_gt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_gt_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_mirror -// GFX12: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x79,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_gt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_u16 v127.h, v127.h 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff] @@ -932,47 +968,53 @@ v_cmpx_le_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_le_i16 v1, v2 row_mirror -// GFX12: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_i16 v1, 
v2 row_shl:1 -// GFX12: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_i16 v1, v2 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x05,0x30] 
v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff] @@ -1016,47 +1058,53 @@ v_cmpx_le_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_mirror -// GFX12: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] 
-v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff] @@ -1274,47 +1322,53 @@ v_cmpx_lt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_lt_i16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_mirror -// GFX12: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shl:15 +// GFX12: 
v_cmpx_lt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_i16 v1.l, 
v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] @@ -1358,47 +1412,53 @@ 
v_cmpx_lt_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lt_u16 v1, v2 row_mirror -// GFX12: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] 
-v_cmpx_lt_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_u16 
v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x05,0x30] + +v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] @@ -1442,47 +1502,53 @@ v_cmpx_lt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lt_u32 v255, v255 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_mirror -// GFX12: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ne_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] @@ -1526,47 +1592,53 @@ v_cmpx_ne_i32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_i32 v255, v255 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_ne_u16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_mirror -// GFX12: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shr:1 +// GFX12: 
v_cmpx_ne_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ne_u16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ne_u16 v1.l, v2.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x05,0x30] -v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x05,0x30] +v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x05,0x30] v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 834c89dd30cddb..c6e7fd1aa96da0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -44,14 +44,20 @@ v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_f32 v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] @@ -62,14 +68,20 @@ v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] @@ -98,14 +110,20 @@ v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: 
v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] @@ -116,14 +134,20 @@ v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] 
-v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] @@ -152,14 +176,20 @@ v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_i16 v127, v127 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] @@ -170,14 +200,20 @@ v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_u16 
v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] @@ -206,14 +242,20 @@ v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] @@ -224,14 +266,20 @@ v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] @@ -284,14 +332,20 @@ v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_lt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] @@ -302,14 +356,20 @@ v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16 
v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] @@ -320,14 +380,20 @@ v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] 
-v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] @@ -338,14 +404,20 @@ v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ne_u16 v1, v2 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s index 39afbf4d47be08..7ba3aff6c80ca8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s @@ -55,41 +55,77 @@ v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v1.l, 
v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_eq_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for 
instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_ge_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -109,41 +145,77 @@ v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: 
invalid operand for instruction -v_cmpx_ge_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.l, v2.l 
dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_gt_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -163,41 +235,77 @@ v_cmpx_gt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction 
-v_cmpx_gt_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: 
error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + 
+v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_le_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -217,41 +325,77 @@ v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 
dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: 
:[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lg_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -307,77 +451,149 @@ v_cmpx_lt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// 
GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lt_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction 
-v_cmpx_ne_i16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_i16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: 
error: invalid operand for instruction -v_cmpx_ne_u16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_i16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for 
instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_neq_f16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index ca84ac51f6dd86..b7423dcde03d4f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -55,41 +55,77 @@ v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_i16 v1, v255 -// GFX12: v_cmpx_eq_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_i16 v1.h, v255.h +// GFX12: v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: 
v_cmpx_eq_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_i16 v255, v2 -// GFX12: v_cmpx_eq_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_i16 v1.l, v255.l +// GFX12: v_cmpx_eq_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_u16 v1, v255 -// GFX12: v_cmpx_eq_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_i16 v255.h, v2.h +// GFX12: v_cmpx_eq_i16_e64 v255.h, v2.h ; 
encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_eq_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_eq_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_u16 v255, v2 -// GFX12: v_cmpx_eq_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_i16 v255.l, v2.l +// GFX12: v_cmpx_eq_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_eq_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_eq_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.h, v255.h +// GFX12: v_cmpx_eq_u16_e64 v1.h, v255.h ; 
encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_eq_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v1.l, v255.l +// GFX12: v_cmpx_eq_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_eq_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_eq_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v255.h, v2.h +// GFX12: v_cmpx_eq_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_u16 v255.l, v2.l +// GFX12: v_cmpx_eq_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_u16_e64_dpp 
v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_f16 v1, v255 // GFX12: v_cmpx_ge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00] @@ -109,41 +145,77 @@ v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_i16 v1, v255 -// GFX12: v_cmpx_ge_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_i16 v1.h, v255.h +// GFX12: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ge_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v1.l, v255.l +// GFX12: v_cmpx_ge_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ge_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ge_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_i16 v255.h, v2.h +// GFX12: v_cmpx_ge_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ge_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ge_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_i16 v255, v2 -// GFX12: v_cmpx_ge_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_i16 v255.l, v2.l +// GFX12: v_cmpx_ge_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ge_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ge_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_u16 v1, v255 -// GFX12: v_cmpx_ge_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_u16 v1.h, v255.h +// GFX12: v_cmpx_ge_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] 
-v_cmpx_ge_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_u16 v255, v2 -// GFX12: v_cmpx_ge_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_u16 v1.l, v255.l +// GFX12: v_cmpx_ge_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_u16 v255.h, v2.h +// GFX12: v_cmpx_ge_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] + 
+v_cmpx_ge_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ge_u16 v255.l, v2.l +// GFX12: v_cmpx_ge_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_f16 v1, v255 // GFX12: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] @@ -163,41 +235,77 @@ v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_i16 v1, v255 -// GFX12: v_cmpx_gt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_i16 v1.h, v255.h +// GFX12: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x18,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v1.l, v255.l +// GFX12: v_cmpx_gt_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v255.h, v2.h +// GFX12: v_cmpx_gt_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_i16 v255.l, v2.l +// GFX12: v_cmpx_gt_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_u16 v1.h, v255.h +// GFX12: v_cmpx_gt_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_i16 v255, v2 -// GFX12: v_cmpx_gt_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_u16 v1.l, v255.l +// GFX12: v_cmpx_gt_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_u16 v1, v255 -// GFX12: v_cmpx_gt_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_u16 v255.h, v2.h +// GFX12: v_cmpx_gt_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] 
-v_cmpx_gt_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_u16 v255, v2 -// GFX12: v_cmpx_gt_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_u16 v255.l, v2.l +// GFX12: v_cmpx_gt_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_gt_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_f16 v1, v255 // GFX12: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] @@ -217,41 
+325,77 @@ v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_i16 v1, v255 -// GFX12: v_cmpx_le_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_i16 v1.h, v255.h +// GFX12: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_le_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_i16 v255, v2 -// GFX12: v_cmpx_le_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_i16 v1.l, v255.l +// GFX12: v_cmpx_le_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_le_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_le_i16 v255, v2 quad_perm:[3,2,1,0] -// 
GFX12: v_cmpx_le_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_le_u16 v1, v255 -// GFX12: v_cmpx_le_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_i16 v255.h, v2.h +// GFX12: v_cmpx_le_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_u16 v255, v2 -// GFX12: v_cmpx_le_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_i16 v255.l, v2.l +// GFX12: v_cmpx_le_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.h, v255.h +// GFX12: v_cmpx_le_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v1.l, v255.l +// GFX12: v_cmpx_le_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_u16 v255.h, v2.h +// GFX12: v_cmpx_le_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_le_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_le_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: 
v_cmpx_le_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_le_u16 v255.l, v2.l +// GFX12: v_cmpx_le_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lg_f16 v1, v255 // GFX12: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] @@ -307,77 +451,149 @@ v_cmpx_lt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_i16 v1, v255 -// GFX12: v_cmpx_lt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lt_i16 v1.h, v255.h +// GFX12: v_cmpx_lt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v1.l, v255.l +// GFX12: v_cmpx_lt_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp 
v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v255.h, v2.h +// GFX12: v_cmpx_lt_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_i16 v255.l, v2.l +// GFX12: v_cmpx_lt_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v1.h, v255.h +// GFX12: v_cmpx_lt_u16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + 
+v_cmpx_lt_u16 v1.l, v255.l +// GFX12: v_cmpx_lt_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_lt_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_lt_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v255.h, v2.h +// GFX12: v_cmpx_lt_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_u16 v255.l, v2.l +// GFX12: v_cmpx_lt_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ne_i16 v1.h, v255.h +// GFX12: v_cmpx_ne_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.h, v255.h 
dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_i16 v255, v2 -// GFX12: v_cmpx_lt_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_i16 v1.l, v255.l +// GFX12: v_cmpx_ne_i16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_i16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_i16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_u16 v1, v255 -// GFX12: v_cmpx_lt_u16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_i16 v255.h, v2.h +// GFX12: v_cmpx_ne_i16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_lt_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v255 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_lt_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_i16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_u16 v255, v2 -// GFX12: v_cmpx_lt_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_i16 v255.l, v2.l +// GFX12: v_cmpx_ne_i16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_lt_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_i16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_lt_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_i16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ne_i16 v1, v255 -// GFX12: v_cmpx_ne_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_u16 v1.h, v255.h +// GFX12: v_cmpx_ne_u16_e64 v1.h, v255.h ; encoding: 
[0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ne_i16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ne_i16 v255, v2 -// GFX12: v_cmpx_ne_i16_e64 v255, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_u16 v1.l, v255.l +// GFX12: v_cmpx_ne_u16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ne_i16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_u16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ne_i16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_i16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_u16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ne_u16 v1, v255 -// GFX12: v_cmpx_ne_u16_e64 v1, v255 ; encoding: 
[0x7e,0x00,0xbd,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ne_u16 v255.h, v2.h +// GFX12: v_cmpx_ne_u16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ne_u16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ne_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ne_u16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ne_u16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ne_u16 v255, v2 -// GFX12: v_cmpx_ne_u16_e64 v255, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ne_u16 v255.l, v2.l +// GFX12: v_cmpx_ne_u16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_ne_u16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_ne_u16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ne_u16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_neq_f16 v1, v255 // GFX12: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index fea883471177f9..20250c1df729e6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -199,46 +199,72 @@ # GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp 
v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp 
v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -283,46 +309,72 @@ # GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 
; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -619,46 +671,72 @@ # GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -703,46 +781,72 @@ # GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# 
GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: 
v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -871,46 +975,72 @@ # GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 
; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -955,46 +1085,72 @@ # GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1123,46 +1279,72 @@ # GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 
+# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: 
v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1207,46 +1389,72 @@ # GFX11: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# 
GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: 
v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + 
+0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1485,46 +1693,72 @@ # GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1569,46 +1803,72 @@ # GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# 
GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: 
v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1653,46 +1913,72 @@ # GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# 
GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: 
v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + 
+0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1737,46 +2023,72 @@ # GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt index 826374f8f830ae..a1ef8f36e77be9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt @@ -55,10 +55,24 @@ # GFX11: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + 
+0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -67,10 +81,24 @@ # GFX11: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -139,10 +167,24 @@ # GFX11: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -151,10 +193,24 @@ # GFX11: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -187,10 +243,24 @@ # GFX11: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -199,10 +269,24 @@ # GFX11: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -235,10 +319,24 @@ # GFX11: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -247,10 +345,24 @@ # GFX11: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -323,10 +435,24 @@ # GFX11: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -335,10 +461,24 @@ # GFX11: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -347,10 +487,24 @@ # GFX11: v_cmpx_lt_u32_e64_dpp v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -359,10 +513,24 @@ # GFX11: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index 281eb66be5a184..a3e9f92454e3a4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -286,10 +286,12 @@ # GFX11: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: 
v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] @@ -330,6 +332,14 @@ 0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -412,10 +422,12 @@ # GFX11: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] @@ -456,6 +468,14 
@@ 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -952,10 +972,12 @@ # GFX11: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] @@ -996,6 +1018,14 @@ 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 
+# GFX11-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -1078,10 +1108,12 @@ # GFX11: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00] @@ -1122,6 +1154,14 @@ 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] + 
+0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -1330,10 +1370,12 @@ # GFX11: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00] @@ -1374,6 +1416,14 @@ 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; 
encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1456,10 +1506,12 @@ # GFX11: v_cmpx_gt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] @@ -1500,6 +1552,14 @@ 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1708,10 +1768,12 @@ # GFX11: 
v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] @@ -1752,6 +1814,14 @@ 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1834,10 +1904,12 @@ # GFX11: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_le_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] @@ -1878,6 +1950,14 @@ 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -2222,10 +2302,12 @@ # GFX11: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] @@ -2266,6 +2348,14 @@ 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2348,10 +2438,12 @@ # GFX11: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.l 
; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] @@ -2392,6 +2484,14 @@ 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2474,10 +2574,12 @@ # GFX11: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ne_i16_e64 s1, 
s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] @@ -2518,6 +2620,14 @@ 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2600,10 +2710,12 @@ # GFX11: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] @@ -2644,6 +2756,14 @@ 0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index caabe1eecc1a06..f058a9b9816252 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -292,49 +292,84 @@ # GFX11: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] 0x7f,0x05,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] 0x01,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] 0x69,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 s105, v2.l ; 
encoding: [0x69,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] 0x6a,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] 0x6b,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] 0x7b,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] 0x7d,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] 0x7e,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] 0x7f,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] 0x7c,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] 0xc1,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] +# GFX11-REAL16: 
v_cmpx_eq_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] 0xf0,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x64,0x7d -# GFX11: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] +# GFX11-REAL16: v_cmpx_eq_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] 0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x64,0x7d +# GFX11-REAL16: v_cmpx_eq_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x64,0x7d] + +0xff,0x05,0x64,0x7d +# GFX11-REAL16: v_cmpx_eq_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x64,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x64,0x7d] + +0xf0,0xfe,0x64,0x7d +# GFX11-REAL16: v_cmpx_eq_i16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x64,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x65,0x7d +# GFX11-REAL16: v_cmpx_eq_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x65,0x7d] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: 
[0xfd,0x04,0x65,0x7d] + +0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x84,0x7d # GFX11: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] @@ -418,49 +453,84 @@ # GFX11: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] 0x7f,0x05,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] 0x01,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] 0x69,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] 0x6a,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] 0x6b,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x74,0x7d] +# GFX11-FAKE16: 
v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] 0x7b,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] 0x7d,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] 0x7e,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] 0x7f,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] 0x7c,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] 0xc1,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] +# GFX11-REAL16: v_cmpx_eq_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] 0xf0,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x74,0x7d -# GFX11: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] +# GFX11-REAL16: 
v_cmpx_eq_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] 0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x74,0x7d +# GFX11-REAL16: v_cmpx_eq_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x74,0x7d] + +0xff,0x05,0x74,0x7d +# GFX11-REAL16: v_cmpx_eq_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x74,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x74,0x7d] + +0xf0,0xfe,0x74,0x7d +# GFX11-REAL16: v_cmpx_eq_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x74,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x75,0x7d +# GFX11-REAL16: v_cmpx_eq_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x75,0x7d] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x75,0x7d] + +0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x94,0x7d # GFX11: v_cmpx_eq_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x94,0x7d] @@ -958,49 +1028,84 @@ # GFX11: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] 
0x01,0x05,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] 0x7f,0x05,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] 0x01,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] 0x69,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] 0x6a,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] 0x6b,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] 0x7b,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] 0x7d,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] 
0x7e,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] 0x7f,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] 0x7c,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] 0xc1,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] 0xf0,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x6c,0x7d -# GFX11: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] +# GFX11-REAL16: v_cmpx_ge_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] 0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x6c,0x7d +# GFX11-REAL16: v_cmpx_ge_i16_e32 v1.h, v2.l ; 
encoding: [0x81,0x05,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x6c,0x7d] + +0xff,0x05,0x6c,0x7d +# GFX11-REAL16: v_cmpx_ge_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x6c,0x7d] + +0xf0,0xfe,0x6c,0x7d +# GFX11-REAL16: v_cmpx_ge_i16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x6c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x6d,0x7d +# GFX11-REAL16: v_cmpx_ge_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6d,0x7d] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x6d,0x7d] + +0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x8c,0x7d # GFX11: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] @@ -1084,49 +1189,84 @@ # GFX11: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] 0x7f,0x05,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] 0x01,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 s1, v2 ; encoding: 
[0x01,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] 0x69,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] 0x6a,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] 0x6b,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] 0x7b,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] 0x7d,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] 0x7e,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] 0x7f,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] 0x7c,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 
null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] 0xc1,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] 0xf0,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x7c,0x7d -# GFX11: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] +# GFX11-REAL16: v_cmpx_ge_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] 0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x7c,0x7d +# GFX11-REAL16: v_cmpx_ge_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x7c,0x7d] + +0xff,0x05,0x7c,0x7d +# GFX11-REAL16: v_cmpx_ge_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x7c,0x7d] + +0xf0,0xfe,0x7c,0x7d +# GFX11-REAL16: v_cmpx_ge_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x7c,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: 
v_cmpx_ge_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x7d,0x7d +# GFX11-REAL16: v_cmpx_ge_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7d,0x7d] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x7d,0x7d] + +0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x9c,0x7d # GFX11: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] @@ -1336,49 +1476,84 @@ # GFX11: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] 0x7f,0x05,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] 0x01,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] 0x69,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] 0x6a,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 vcc_lo, v2.l ; 
encoding: [0x6a,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] 0x6b,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] 0x7b,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] 0x7d,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] 0x7e,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] 0x7f,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] 0x7c,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] 0xc1,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] 0xf0,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] +# 
GFX11-REAL16: v_cmpx_gt_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x68,0x7d -# GFX11: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] +# GFX11-REAL16: v_cmpx_gt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] 0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x68,0x7d +# GFX11-REAL16: v_cmpx_gt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x68,0x7d] + +0xff,0x05,0x68,0x7d +# GFX11-REAL16: v_cmpx_gt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x68,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x68,0x7d] + +0xf0,0xfe,0x68,0x7d +# GFX11-REAL16: v_cmpx_gt_i16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x68,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x69,0x7d +# GFX11-REAL16: v_cmpx_gt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x69,0x7d] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x69,0x7d] + +0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16_e32 0xfe0b, v255/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x88,0x7d # GFX11: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] @@ -1462,49 +1637,84 @@ # GFX11: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] 0x7f,0x05,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] 0x01,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] 0x69,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] 0x6a,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] 0x6b,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] 0x7b,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x78,0x7d] +# GFX11-FAKE16: 
v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] 0x7d,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] 0x7e,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] 0x7f,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] 0x7c,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] 0xc1,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] 0xf0,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x78,0x7d -# GFX11: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] +# GFX11-REAL16: v_cmpx_gt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] 0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x78,0x7d +# GFX11-REAL16: v_cmpx_gt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x78,0x7d] + +0xff,0x05,0x78,0x7d +# GFX11-REAL16: v_cmpx_gt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x78,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x78,0x7d] + +0xf0,0xfe,0x78,0x7d +# GFX11-REAL16: v_cmpx_gt_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x78,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x79,0x7d +# GFX11-REAL16: v_cmpx_gt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x79,0x7d] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x79,0x7d] + +0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x98,0x7d # GFX11: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] @@ -1714,49 +1924,84 @@ # GFX11: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] 
0x7f,0x05,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] 0x01,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] 0x69,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] 0x6a,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] 0x6b,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] 0x7b,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] 0x7d,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] 0x7e,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x66,0x7d] 0x7f,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] 0x7c,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] 0xc1,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] 0xf0,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x66,0x7d -# GFX11: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] +# GFX11-REAL16: v_cmpx_le_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] 0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x66,0x7d +# GFX11-REAL16: v_cmpx_le_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x66,0x7d] + +0xff,0x05,0x66,0x7d +# GFX11-REAL16: v_cmpx_le_i16_e32 v127.h, v2.l 
; encoding: [0xff,0x05,0x66,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x66,0x7d] + +0xf0,0xfe,0x66,0x7d +# GFX11-REAL16: v_cmpx_le_i16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x66,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x67,0x7d +# GFX11-REAL16: v_cmpx_le_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x67,0x7d] +# GFX11-FAKE16: v_cmpx_le_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x67,0x7d] + +0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x86,0x7d # GFX11: v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] @@ -1840,49 +2085,84 @@ # GFX11: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] 0x7f,0x05,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] 0x01,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] 0x69,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 s105, v2 ; encoding: 
[0x69,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] 0x6a,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] 0x6b,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] 0x7b,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] 0x7d,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] 0x7e,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] 0x7f,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] 0x7c,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] 0xc1,0x04,0x76,0x7d -# GFX11: 
v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] 0xf0,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x76,0x7d -# GFX11: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] +# GFX11-REAL16: v_cmpx_le_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] 0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x76,0x7d +# GFX11-REAL16: v_cmpx_le_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x76,0x7d] + +0xff,0x05,0x76,0x7d +# GFX11-REAL16: v_cmpx_le_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x76,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x76,0x7d] + +0xf0,0xfe,0x76,0x7d +# GFX11-REAL16: v_cmpx_le_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x76,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x77,0x7d +# GFX11-REAL16: v_cmpx_le_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x77,0x7d] +# GFX11-FAKE16: v_cmpx_le_u16_e32 src_scc, 
v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x77,0x7d] + +0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x96,0x7d # GFX11: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] @@ -2253,49 +2533,84 @@ # GFX11: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] 0x7f,0x05,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] 0x01,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] 0x69,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] 0x6a,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] 0x6b,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] +# GFX11-REAL16: 
v_cmpx_lt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] 0x7b,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] 0x7d,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] 0x7e,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] 0x7f,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] 0x7c,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] 0xc1,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] 0xf0,0x04,0x62,0x7d -# GFX11: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x62,0x7d -# GFX11: 
v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] +# GFX11-REAL16: v_cmpx_lt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] 0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x62,0x7d +# GFX11-REAL16: v_cmpx_lt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x62,0x7d] + +0xff,0x05,0x62,0x7d +# GFX11-REAL16: v_cmpx_lt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x62,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x62,0x7d] + +0xf0,0xfe,0x62,0x7d +# GFX11-REAL16: v_cmpx_lt_i16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x62,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x63,0x7d +# GFX11-REAL16: v_cmpx_lt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x63,0x7d] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x63,0x7d] + +0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x82,0x7d # GFX11: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] @@ -2379,49 +2694,84 @@ # GFX11: v_cmpx_lt_i64_e32 
0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] 0x7f,0x05,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] 0x01,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] 0x69,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] 0x6a,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] 0x6b,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] 0x7b,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] 0x7d,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x72,0x7d] +# 
GFX11-FAKE16: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] 0x7e,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] 0x7f,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] 0x7c,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] 0xc1,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] 0xf0,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x72,0x7d -# GFX11: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] +# GFX11-REAL16: v_cmpx_lt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] 0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] + 
+0x81,0x05,0x72,0x7d +# GFX11-REAL16: v_cmpx_lt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x72,0x7d] + +0xff,0x05,0x72,0x7d +# GFX11-REAL16: v_cmpx_lt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x72,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x72,0x7d] + +0xf0,0xfe,0x72,0x7d +# GFX11-REAL16: v_cmpx_lt_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x72,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x73,0x7d +# GFX11-REAL16: v_cmpx_lt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x73,0x7d] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x73,0x7d] + +0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x92,0x7d # GFX11: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] @@ -2505,49 +2855,84 @@ # GFX11: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] 0x7f,0x05,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] 
0x01,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] 0x69,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] 0x6a,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] 0x6b,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] 0x7b,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] 0x7d,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] 0x7e,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] 0x7f,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: 
[0x7f,0x04,0x6a,0x7d] 0x7c,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] 0xc1,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] 0xf0,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x6a,0x7d -# GFX11: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] +# GFX11-REAL16: v_cmpx_ne_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] 0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x6a,0x7d +# GFX11-REAL16: v_cmpx_ne_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x6a,0x7d] + +0xff,0x05,0x6a,0x7d +# GFX11-REAL16: v_cmpx_ne_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x6a,0x7d] + +0xf0,0xfe,0x6a,0x7d +# GFX11-REAL16: v_cmpx_ne_i16_e32 0x3800, v127.l ; 
encoding: [0xff,0xfe,0x6a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x6b,0x7d +# GFX11-REAL16: v_cmpx_ne_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6b,0x7d] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x6b,0x7d] + +0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x8a,0x7d # GFX11: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] @@ -2631,49 +3016,84 @@ # GFX11: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] 0x7f,0x05,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] 0x01,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] 0x69,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] 0x6a,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: 
[0x6a,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] 0x6b,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] 0x7b,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] 0x7d,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] 0x7e,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] 0x7f,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] 0x7c,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] 0xc1,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] 0xf0,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 
0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x7a,0x7d -# GFX11: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] +# GFX11-REAL16: v_cmpx_ne_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] 0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x7a,0x7d +# GFX11-REAL16: v_cmpx_ne_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x7a,0x7d] + +0xff,0x05,0x7a,0x7d +# GFX11-REAL16: v_cmpx_ne_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7a,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x7a,0x7d] + +0xf0,0xfe,0x7a,0x7d +# GFX11-REAL16: v_cmpx_ne_u16_e32 0x3800, v127.l ; encoding: [0xff,0xfe,0x7a,0x7d,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 0x3800, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x00,0x38,0x00,0x00] + +0xfd,0x04,0x7b,0x7d +# GFX11-REAL16: v_cmpx_ne_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7b,0x7d] +# GFX11-FAKE16: v_cmpx_ne_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x7b,0x7d] + +0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: 
v_cmpx_ne_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x9a,0x7d # GFX11: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index f660760cd9c51b..5a57f93c65939c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -199,46 +199,72 @@ # GFX11: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_eq_i16 
v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x64,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_eq_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] @@ -283,46 +309,72 @@ # GFX11: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_eq_u16 
v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x74,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_eq_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_eq_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_eq_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_eq_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] @@ 
-619,46 +671,72 @@ # GFX11: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff -# GFX11: 
v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] +# 
GFX11-REAL16: v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x6c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] @@ -703,46 +781,72 @@ # GFX11: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 
v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, 
v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x7c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff] @@ -871,46 +975,72 @@ # GFX11: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x68,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0xfe,0x68,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff] @@ -955,46 +1085,72 @@ # GFX11: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x78,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_u16 v1.h, v2.h 
row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff] @@ -1123,46 +1279,72 @@ # GFX11: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: 
v_cmpx_le_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01 -# 
GFX11: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x66,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff] @@ -1207,46 +1389,72 @@ # GFX11: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: 
v_cmpx_le_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] 
0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x76,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_le_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff] @@ -1485,46 +1693,72 @@ # GFX11: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_i16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x62,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] @@ -1569,46 +1803,72 @@ # GFX11: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_u16 
v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x72,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] @@ 
-1653,46 +1913,72 @@ # GFX11: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff -# GFX11: 
v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] +# 
GFX11-REAL16: v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x6a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ne_i16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_i16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] @@ -1737,46 +2023,72 @@ # GFX11: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 
v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, 
v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0xfe,0x7a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ne_u16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ne_u16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ne_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ne_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index f32adeb61b16ee..8350088ca95a59 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -43,10 +43,30 @@ # GFX11: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x64,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] + 
+0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] @@ -55,10 +75,30 @@ # GFX11: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x74,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_eq_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_eq_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] 
0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] @@ -103,10 +143,30 @@ # GFX11: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6c,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] @@ -115,10 +175,30 @@ # GFX11: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7c,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] @@ -139,10 +219,30 @@ # GFX11: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xfe,0x68,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] @@ -151,10 +251,30 @@ # GFX11: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x78,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] 
+ +0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] @@ -175,10 +295,30 @@ # GFX11: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x66,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] 
0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] @@ -187,10 +327,30 @@ # GFX11: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x76,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] @@ -237,10 +397,30 @@ # GFX11: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x62,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] @@ -249,10 +429,30 @@ # GFX11: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xfe,0x72,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] @@ -261,10 +461,30 @@ # GFX11: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_i16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x6a,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] 
+ +0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] @@ -273,10 +493,30 @@ # GFX11: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_u16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x7a,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ne_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ne_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] 
0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt index fae898c04b6cbb..80235451fec6f3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt @@ -282,10 +282,12 @@ # GFX12: v_cmpx_eq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa2,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] @@ -326,6 +328,16 @@ 0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# 
GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -408,10 +420,12 @@ # GFX12: v_cmpx_eq_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd2,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] @@ -452,6 +466,16 @@ 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: 
v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -660,10 +684,12 @@ # GFX12: v_cmpx_ge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa6,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] @@ -704,6 +730,16 @@ 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] + 
0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -786,10 +822,12 @@ # GFX12: v_cmpx_ge_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd6,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00] @@ -830,6 +868,16 @@ 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -1038,10 +1086,12 @@ # GFX12: v_cmpx_gt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa4,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00] @@ -1082,6 +1132,16 @@ 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1164,10 +1224,12 @@ # GFX12: v_cmpx_gt_i64_e64 0xaf123456, vcc 
; encoding: [0x7e,0x00,0xd4,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] @@ -1208,6 +1270,16 @@ 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1416,10 +1488,12 @@ # GFX12: v_cmpx_le_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa3,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 
0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] @@ -1460,6 +1534,16 @@ 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1542,10 +1626,12 @@ # GFX12: v_cmpx_le_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd3,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_le_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] @@ -1586,6 +1672,16 @@ 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -1930,10 +2026,12 @@ # GFX12: v_cmpx_lt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa1,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.l, v2.l ; encoding: 
[0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] @@ -1974,6 +2072,16 @@ 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2056,10 +2164,12 @@ # GFX12: v_cmpx_lt_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd1,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] @@ -2100,6 +2210,16 @@ 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2182,10 +2302,12 @@ # GFX12: v_cmpx_lt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd9,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_ne_i16_e64 v255, 
v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] @@ -2226,6 +2348,16 @@ 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2308,10 +2440,12 @@ # GFX12: v_cmpx_ne_i64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd5,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.l ; 
encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] @@ -2352,6 +2486,16 @@ 0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] + +# GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] + 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt index ffbdcba67ce18e..2dc231a4220f17 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt @@ -261,49 +261,126 @@ # GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: 
v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp 
v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e64 
v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb2,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb2,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb2,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb2,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb2,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb2,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb2,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 0x3800, m0 ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb2,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: 
v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -354,49 +431,126 @@ # GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 
0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: 
v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xba,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xba,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xba,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xba,0xd4,0x7b,0xfa,0x01,0x00] + 
+0x7e,0x00,0xba,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xba,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_eq_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xba,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xba,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xba,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xba,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# 
GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -534,49 +688,126 @@ # GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: 
v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb6,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb6,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb6,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb6,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb6,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb6,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb6,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_ge_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb6,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 
+# GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -624,49 +855,126 @@ # GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 
0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: 
v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, s3 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbe,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbe,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbe,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbe,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xbe,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbe,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_u16_e64 exec_lo, -1 ; encoding: 
[0x7e,0x00,0xbe,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbe,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbe,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -804,49 +1112,126 @@ # GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: 
v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 
s105, s105 ; encoding: [0x7e,0x00,0xb4,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb4,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb4,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_gt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb4,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb4,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_gt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_gt_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb4,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb4,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb4,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h 
op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -894,49 +1279,126 @@ # GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbc,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbc,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbc,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_gt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbc,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xbc,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_gt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_gt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbc,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbc,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 -1, exec_hi ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbc,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1074,49 +1536,126 @@ # GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp 
v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb3,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb3,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb3,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + 
+0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_le_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb3,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb3,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_le_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_le_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb3,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb3,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb3,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1164,49 +1703,126 @@ # GFX12: v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: 
v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbb,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbb,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbb,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_le_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbb,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xbb,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_le_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_le_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbb,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbb,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_le_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbb,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 
+# GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_u32_e64_dpp v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1508,49 +2124,126 @@ # GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 
0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: 
v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, s3 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb1,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb1,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb1,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb1,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb1,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb1,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_i16_e64 exec_lo, -1 ; encoding: 
[0x7e,0x00,0xb1,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb1,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb1,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1598,49 +2291,126 @@ # GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: 
v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 
s105, s105 ; encoding: [0x7e,0x00,0xb9,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb9,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb9,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb9,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb9,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb9,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb9,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb9,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h 
op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1688,49 +2458,126 @@ # GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 s1, s2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 s105, s105 ; encoding: [0x7e,0x00,0xb5,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xb5,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xb5,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ne_i16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xb5,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xb5,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ne_i16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ne_i16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xb5,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xb5,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 -1, exec_hi ; encoding: 
[0x7e,0x00,0xb5,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xb5,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1778,49 +2625,126 @@ # GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp 
v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 s1, s2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 s105, s105 ; encoding: [0x7e,0x00,0xbd,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0xbd,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0xbd,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + 
+0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ne_u16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0xbd,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0xbd,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ne_u16_e64 m0, 0x3800 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ne_u16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0xbd,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 exec_hi, null ; encoding: [0x7e,0x00,0xbd,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 null, exec_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0xf0,0xfa,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 0x3800, m0 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 src_scc, vcc_lo ; encoding: [0x7e,0x00,0xbd,0xd4,0xfd,0xd4,0x00,0x00] + +0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index ae945cbad54aea..cff9497778265a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -71,13 +71,31 @@ # GFX12: v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x92,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -89,13 +107,31 @@ # GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -140,13 +176,31 @@ # GFX12: v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x96,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -158,13 +212,31 @@ # GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -206,13 +278,31 @@ # GFX12: v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x94,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -224,13 +314,31 @@ # GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.l, 
s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
+0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -272,13 +380,31 @@ # GFX12: v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x93,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -290,13 +416,31 @@ # GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -385,13 +529,31 @@ # GFX12: v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x91,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -403,13 +565,31 @@ # GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -421,13 +601,31 @@ # GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -442,13 +640,31 @@ # GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index ac83043628cb48..6ca815a1c88d3b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -288,49 +288,80 @@ # GFX12: v_cmpx_eq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x45,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x64,0x7d] 
0x7f,0x05,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x64,0x7d] 0x01,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x64,0x7d] 0x69,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x64,0x7d] 0x6a,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x64,0x7d] 0x6b,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x64,0x7d] 0x7b,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x64,0x7d] 0x7d,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x64,0x7d] 0x7e,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x64,0x7d] 0x7f,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x64,0x7d] 0x7c,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x64,0x7d] 0xc1,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x64,0x7d] 0xf0,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x64,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x64,0x7d -# GFX12: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] +# GFX12-REAL16: v_cmpx_eq_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x64,0x7d] 0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x64,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x64,0x7d +# GFX12-REAL16: v_cmpx_eq_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x64,0x7d] + +0xff,0x05,0x64,0x7d +# GFX12-REAL16: v_cmpx_eq_i16_e32 v127.h, v2.l 
; encoding: [0xff,0x05,0x64,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x64,0x7d] + +0xfd,0x04,0x65,0x7d +# GFX12-REAL16: v_cmpx_eq_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x65,0x7d] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x65,0x7d] + +0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x65,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x84,0x7d # GFX12: v_cmpx_eq_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x84,0x7d] @@ -414,49 +445,80 @@ # GFX12: v_cmpx_eq_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x74,0x7d] 0x7f,0x05,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x74,0x7d] 0x01,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x74,0x7d] 0x69,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x74,0x7d] 0x6a,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 vcc_lo, 
v2 ; encoding: [0x6a,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x74,0x7d] 0x6b,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x74,0x7d] 0x7b,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x74,0x7d] 0x7d,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x74,0x7d] 0x7e,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x74,0x7d] 0x7f,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x74,0x7d] 0x7c,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x74,0x7d] 0xc1,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x74,0x7d] 0xf0,0x04,0x74,0x7d -# GFX12: 
v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x74,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x74,0x7d -# GFX12: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] +# GFX12-REAL16: v_cmpx_eq_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x74,0x7d] 0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x74,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x74,0x7d +# GFX12-REAL16: v_cmpx_eq_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x74,0x7d] + +0xff,0x05,0x74,0x7d +# GFX12-REAL16: v_cmpx_eq_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x74,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x74,0x7d] + +0xfd,0x04,0x75,0x7d +# GFX12-REAL16: v_cmpx_eq_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x75,0x7d] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x75,0x7d] + +0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x75,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x94,0x7d # GFX12: v_cmpx_eq_u32_e32 v1, v2 ; 
encoding: [0x01,0x05,0x94,0x7d] @@ -666,49 +728,80 @@ # GFX12: v_cmpx_ge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4d,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6c,0x7d] 0x7f,0x05,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6c,0x7d] 0x01,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6c,0x7d] 0x69,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6c,0x7d] 0x6a,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6c,0x7d] 0x6b,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6c,0x7d] 0x7b,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6c,0x7d] 0x7d,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] 
+# GFX12-REAL16: v_cmpx_ge_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6c,0x7d] 0x7e,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6c,0x7d] 0x7f,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6c,0x7d] 0x7c,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6c,0x7d] 0xc1,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6c,0x7d] 0xf0,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6c,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x6c,0x7d -# GFX12: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] +# GFX12-REAL16: v_cmpx_ge_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6c,0x7d] 0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: 
v_cmpx_ge_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x6c,0x7d +# GFX12-REAL16: v_cmpx_ge_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x6c,0x7d] + +0xff,0x05,0x6c,0x7d +# GFX12-REAL16: v_cmpx_ge_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x6c,0x7d] + +0xfd,0x04,0x6d,0x7d +# GFX12-REAL16: v_cmpx_ge_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6d,0x7d] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x6d,0x7d] + +0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x6d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x8c,0x7d # GFX12: v_cmpx_ge_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8c,0x7d] @@ -792,49 +885,80 @@ # GFX12: v_cmpx_ge_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xad,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7c,0x7d] 0x7f,0x05,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7c,0x7d] 0x01,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 s1, v2.l ; encoding: 
[0x01,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7c,0x7d] 0x69,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7c,0x7d] 0x6a,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7c,0x7d] 0x6b,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7c,0x7d] 0x7b,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7c,0x7d] 0x7d,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7c,0x7d] 0x7e,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7c,0x7d] 0x7f,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7c,0x7d] 0x7c,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 
null, v2.l ; encoding: [0x7c,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7c,0x7d] 0xc1,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7c,0x7d] 0xf0,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7c,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x7c,0x7d -# GFX12: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] +# GFX12-REAL16: v_cmpx_ge_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7c,0x7d] 0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x7c,0x7d +# GFX12-REAL16: v_cmpx_ge_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x7c,0x7d] + +0xff,0x05,0x7c,0x7d +# GFX12-REAL16: v_cmpx_ge_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x7c,0x7d] + +0xfd,0x04,0x7d,0x7d +# GFX12-REAL16: v_cmpx_ge_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7d,0x7d] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: 
[0xfd,0x04,0x7d,0x7d] + +0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x7d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x9c,0x7d # GFX12: v_cmpx_ge_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9c,0x7d] @@ -1044,49 +1168,80 @@ # GFX12: v_cmpx_gt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x49,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x68,0x7d] 0x7f,0x05,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x68,0x7d] 0x01,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x68,0x7d] 0x69,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x68,0x7d] 0x6a,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x68,0x7d] 0x6b,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x68,0x7d] +# GFX12-FAKE16: 
v_cmpx_gt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x68,0x7d] 0x7b,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x68,0x7d] 0x7d,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x68,0x7d] 0x7e,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x68,0x7d] 0x7f,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x68,0x7d] 0x7c,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x68,0x7d] 0xc1,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] +# GFX12-REAL16: v_cmpx_gt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x68,0x7d] 0xf0,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x68,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x68,0x7d -# GFX12: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] +# GFX12-REAL16: 
v_cmpx_gt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x68,0x7d] 0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x68,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x68,0x7d +# GFX12-REAL16: v_cmpx_gt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x68,0x7d] + +0xff,0x05,0x68,0x7d +# GFX12-REAL16: v_cmpx_gt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x68,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x68,0x7d] + +0xfd,0x04,0x69,0x7d +# GFX12-REAL16: v_cmpx_gt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x69,0x7d] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x69,0x7d] + +0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x69,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x88,0x7d # GFX12: v_cmpx_gt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x88,0x7d] @@ -1170,49 +1325,80 @@ # GFX12: v_cmpx_gt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x78,0x7d] 0x7f,0x05,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x78,0x7d] 0x01,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x78,0x7d] 0x69,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x78,0x7d] 0x6a,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x78,0x7d] 0x6b,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x78,0x7d] 0x7b,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x78,0x7d] 0x7d,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x78,0x7d] 0x7e,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 exec_lo, v2 ; 
encoding: [0x7e,0x04,0x78,0x7d] 0x7f,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x78,0x7d] 0x7c,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x78,0x7d] 0xc1,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x78,0x7d] 0xf0,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x78,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x78,0x7d -# GFX12: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] +# GFX12-REAL16: v_cmpx_gt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x78,0x7d] 0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x78,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x78,0x7d +# GFX12-REAL16: v_cmpx_gt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x78,0x7d] + +0xff,0x05,0x78,0x7d +# GFX12-REAL16: v_cmpx_gt_u16_e32 
v127.h, v2.l ; encoding: [0xff,0x05,0x78,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x78,0x7d] + +0xfd,0x04,0x79,0x7d +# GFX12-REAL16: v_cmpx_gt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x79,0x7d] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x79,0x7d] + +0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x79,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x98,0x7d # GFX12: v_cmpx_gt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x98,0x7d] @@ -1422,49 +1608,80 @@ # GFX12: v_cmpx_le_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x47,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x66,0x7d] 0x7f,0x05,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x66,0x7d] 0x01,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x66,0x7d] 0x69,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x66,0x7d] 0x6a,0x04,0x66,0x7d -# GFX12: 
v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x66,0x7d] 0x6b,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x66,0x7d] 0x7b,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x66,0x7d] 0x7d,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x66,0x7d] 0x7e,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x66,0x7d] 0x7f,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x66,0x7d] 0x7c,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x66,0x7d] 0xc1,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x66,0x7d] 
0xf0,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x66,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x66,0x7d -# GFX12: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] +# GFX12-REAL16: v_cmpx_le_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x66,0x7d] 0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x66,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x66,0x7d +# GFX12-REAL16: v_cmpx_le_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x66,0x7d] + +0xff,0x05,0x66,0x7d +# GFX12-REAL16: v_cmpx_le_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x66,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x66,0x7d] + +0xfd,0x04,0x67,0x7d +# GFX12-REAL16: v_cmpx_le_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x67,0x7d] +# GFX12-FAKE16: v_cmpx_le_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x67,0x7d] + +0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x67,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x86,0x7d # GFX12: 
v_cmpx_le_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x86,0x7d] @@ -1548,49 +1765,80 @@ # GFX12: v_cmpx_le_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x76,0x7d] 0x7f,0x05,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x76,0x7d] 0x01,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x76,0x7d] 0x69,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x76,0x7d] 0x6a,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x76,0x7d] 0x6b,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x76,0x7d] 0x7b,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x76,0x7d] 0x7d,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 m0, v2 ; 
encoding: [0x7d,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x76,0x7d] 0x7e,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x76,0x7d] 0x7f,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x76,0x7d] 0x7c,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x76,0x7d] 0xc1,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x76,0x7d] 0xf0,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x76,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x76,0x7d -# GFX12: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] +# GFX12-REAL16: v_cmpx_le_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x76,0x7d] 0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e32 0xfe0b, v127.l ; encoding: 
[0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x76,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x76,0x7d +# GFX12-REAL16: v_cmpx_le_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x76,0x7d] + +0xff,0x05,0x76,0x7d +# GFX12-REAL16: v_cmpx_le_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x76,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x76,0x7d] + +0xfd,0x04,0x77,0x7d +# GFX12-REAL16: v_cmpx_le_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x77,0x7d] +# GFX12-FAKE16: v_cmpx_le_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x77,0x7d] + +0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x77,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x96,0x7d # GFX12: v_cmpx_le_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x96,0x7d] @@ -1957,49 +2205,80 @@ # GFX12: v_cmpx_lt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x43,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x62,0x7d] 0x7f,0x05,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x62,0x7d] 0x01,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] 
+# GFX12-REAL16: v_cmpx_lt_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x62,0x7d] 0x69,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x62,0x7d] 0x6a,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x62,0x7d] 0x6b,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x62,0x7d] 0x7b,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x62,0x7d] 0x7d,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x62,0x7d] 0x7e,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x62,0x7d] 0x7f,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x62,0x7d] 0x7c,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 null, v2 ; encoding: 
[0x7c,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x62,0x7d] 0xc1,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x62,0x7d] 0xf0,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x62,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x62,0x7d -# GFX12: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] +# GFX12-REAL16: v_cmpx_lt_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x62,0x7d] 0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x62,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x62,0x7d +# GFX12-REAL16: v_cmpx_lt_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x62,0x7d] + +0xff,0x05,0x62,0x7d +# GFX12-REAL16: v_cmpx_lt_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x62,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x62,0x7d] + +0xfd,0x04,0x63,0x7d +# GFX12-REAL16: v_cmpx_lt_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x63,0x7d] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 src_scc, v130/*Invalid register, 
operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x63,0x7d] + +0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x63,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x82,0x7d # GFX12: v_cmpx_lt_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x82,0x7d] @@ -2083,49 +2362,80 @@ # GFX12: v_cmpx_lt_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xa3,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x72,0x7d] 0x7f,0x05,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x72,0x7d] 0x01,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x72,0x7d] 0x69,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x72,0x7d] 0x6a,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x72,0x7d] 0x6b,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 vcc_hi, v2.l ; 
encoding: [0x6b,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x72,0x7d] 0x7b,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x72,0x7d] 0x7d,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x72,0x7d] 0x7e,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x72,0x7d] 0x7f,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x72,0x7d] 0x7c,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x72,0x7d] 0xc1,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x72,0x7d] 0xf0,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x72,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x72,0x7d -# GFX12: v_cmpx_lt_u16_e32 src_scc, v2 ; 
encoding: [0xfd,0x04,0x72,0x7d] +# GFX12-REAL16: v_cmpx_lt_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x72,0x7d] 0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x72,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x72,0x7d +# GFX12-REAL16: v_cmpx_lt_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x72,0x7d] + +0xff,0x05,0x72,0x7d +# GFX12-REAL16: v_cmpx_lt_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x72,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x72,0x7d] + +0xfd,0x04,0x73,0x7d +# GFX12-REAL16: v_cmpx_lt_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x73,0x7d] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x73,0x7d] + +0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x73,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x92,0x7d # GFX12: v_cmpx_lt_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x92,0x7d] @@ -2209,49 +2519,80 @@ # GFX12: v_cmpx_lt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb3,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x6a,0x7d] +# GFX12-FAKE16: 
v_cmpx_ne_i16_e32 v1, v2 ; encoding: [0x01,0x05,0x6a,0x7d] 0x7f,0x05,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 v127, v2 ; encoding: [0x7f,0x05,0x6a,0x7d] 0x01,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 s1, v2.l ; encoding: [0x01,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 s1, v2 ; encoding: [0x01,0x04,0x6a,0x7d] 0x69,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 s105, v2.l ; encoding: [0x69,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 s105, v2 ; encoding: [0x69,0x04,0x6a,0x7d] 0x6a,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x6a,0x7d] 0x6b,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x6a,0x7d] 0x7b,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x6a,0x7d] 0x7d,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 m0, v2 ; encoding: [0x7d,0x04,0x6a,0x7d] 0x7e,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x6a,0x7d] +# 
GFX12-FAKE16: v_cmpx_ne_i16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x6a,0x7d] 0x7f,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x6a,0x7d] 0x7c,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 null, v2.l ; encoding: [0x7c,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 null, v2 ; encoding: [0x7c,0x04,0x6a,0x7d] 0xc1,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 -1, v2 ; encoding: [0xc1,0x04,0x6a,0x7d] 0xf0,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x6a,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x6a,0x7d -# GFX12: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] +# GFX12-REAL16: v_cmpx_ne_i16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x6a,0x7d] 0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x6a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x6a,0x7d +# GFX12-REAL16: v_cmpx_ne_i16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x6a,0x7d] + 
+0xff,0x05,0x6a,0x7d +# GFX12-REAL16: v_cmpx_ne_i16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x6a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x6a,0x7d] + +0xfd,0x04,0x6b,0x7d +# GFX12-REAL16: v_cmpx_ne_i16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x6b,0x7d] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x6b,0x7d] + +0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_i16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x6b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x8a,0x7d # GFX12: v_cmpx_ne_i32_e32 v1, v2 ; encoding: [0x01,0x05,0x8a,0x7d] @@ -2335,49 +2676,80 @@ # GFX12: v_cmpx_ne_i64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xab,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 v1, v2 ; encoding: [0x01,0x05,0x7a,0x7d] 0x7f,0x05,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 v127, v2 ; encoding: [0x7f,0x05,0x7a,0x7d] 0x01,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 s1, v2.l ; encoding: [0x01,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 s1, v2 ; encoding: [0x01,0x04,0x7a,0x7d] 0x69,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 s105, v2 ; encoding: [0x69,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 s105, v2.l ; encoding: [0x69,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 s105, v2 ; encoding: 
[0x69,0x04,0x7a,0x7d] 0x6a,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x7a,0x7d] 0x6b,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x7a,0x7d] 0x7b,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x7a,0x7d] 0x7d,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 m0, v2 ; encoding: [0x7d,0x04,0x7a,0x7d] 0x7e,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x7a,0x7d] 0x7f,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x7a,0x7d] 0x7c,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 null, v2.l ; encoding: [0x7c,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 null, v2 ; encoding: [0x7c,0x04,0x7a,0x7d] 0xc1,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 -1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 
-1, v2 ; encoding: [0xc1,0x04,0x7a,0x7d] 0xf0,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e32 0x3800, v2.l ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 0x3800, v2 ; encoding: [0xff,0x04,0x7a,0x7d,0x00,0x38,0x00,0x00] 0xfd,0x04,0x7a,0x7d -# GFX12: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] +# GFX12-REAL16: v_cmpx_ne_u16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x7a,0x7d] 0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x7a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x7a,0x7d +# GFX12-REAL16: v_cmpx_ne_u16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x7a,0x7d] + +0xff,0x05,0x7a,0x7d +# GFX12-REAL16: v_cmpx_ne_u16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x7a,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x7a,0x7d] + +0xfd,0x04,0x7b,0x7d +# GFX12-REAL16: v_cmpx_ne_u16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x7b,0x7d] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x7b,0x7d] + +0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_u16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: 
[0xff,0xfe,0x7b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x9a,0x7d # GFX12: v_cmpx_ne_u32_e32 v1, v2 ; encoding: [0x01,0x05,0x9a,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index 0db4a98489683f..f1fca291204908 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -195,46 +195,68 @@ # GFX12: v_cmpx_eq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x25,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff -# GFX12: 
v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, 
v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: 
v_cmpx_eq_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x64,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_eq_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x64,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_eq_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x65,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_eq_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x65,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x84,0x7d,0x01,0x1b,0x00,0xff] @@ -279,46 +301,68 @@ # GFX12: v_cmpx_eq_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x85,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_eq_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff -# GFX12: 
v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x74,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_eq_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x74,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_eq_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x75,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_eq_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_eq_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x75,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x94,0x7d,0x01,0x1b,0x00,0xff] @@ -447,46 +491,68 @@ # GFX12: v_cmpx_ge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_cmpx_ge_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_ge_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ge_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6c,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0xfa,0x04,0x6d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x8c,0x7d,0x01,0x1b,0x00,0xff] @@ -531,46 +597,68 @@ # GFX12: v_cmpx_ge_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ge_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7c,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_u32 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9c,0x7d,0x01,0x1b,0x00,0xff] @@ -699,46 +787,68 @@ # GFX12: v_cmpx_gt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x29,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x68,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_i16 v127, v127 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x68,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x69,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x69,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x88,0x7d,0x01,0x1b,0x00,0xff] @@ -783,46 +893,68 @@ # GFX12: v_cmpx_gt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x89,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x78,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x78,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x79,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x79,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x98,0x7d,0x01,0x1b,0x00,0xff] @@ -951,46 +1083,68 @@ # GFX12: v_cmpx_le_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x27,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x66,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_i16 v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x66,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x66,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x67,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_i16 v255/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x67,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x86,0x7d,0x01,0x1b,0x00,0xff] @@ -1035,46 +1189,68 @@ # GFX12: v_cmpx_le_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x87,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x76,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x76,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x76,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x77,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x77,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x96,0x7d,0x01,0x1b,0x00,0xff] @@ -1309,46 +1485,68 @@ # GFX12: v_cmpx_lt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x23,0x7d,0xff,0x6f,0xfd,0x30] 
0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x62,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lt_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: 
v_cmpx_lt_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x62,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x63,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lt_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x63,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x82,0x7d,0x01,0x1b,0x00,0xff] @@ -1393,46 +1591,68 @@ # GFX12: v_cmpx_lt_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x83,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x72,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x72,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lt_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x72,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x73,0x7d,0x81,0x60,0x01,0x13] + 
+0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lt_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x73,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x92,0x7d,0x01,0x1b,0x00,0xff] @@ -1477,46 +1697,68 @@ # GFX12: v_cmpx_lt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x93,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x6a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ne_i16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6a,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ne_i16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x6b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ne_i16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x6b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ne_i32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x8a,0x7d,0x01,0x1b,0x00,0xff] @@ -1561,46 +1803,68 @@ # GFX12: v_cmpx_ne_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x8b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x7a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ne_u16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7a,0x7d,0x7f,0x6f,0x0d,0x30] + +0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ne_u16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ne_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x7b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ne_u16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ne_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x7b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ne_u32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x9a,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index 9bcc6a89ff5d8e..b2539ad5a49e72 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -39,10 +39,20 @@ # GFX12: v_cmpx_eq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x25,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_i16 
v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x64,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x64,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x65,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x65,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x84,0x7d,0x01,0x77,0x39,0x05] @@ -51,10 +61,20 @@ # GFX12: v_cmpx_eq_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x85,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +# 
GFX12-REAL16: v_cmpx_eq_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x74,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x74,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x75,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x75,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x94,0x7d,0x01,0x77,0x39,0x05] @@ -75,10 +95,20 @@ # GFX12: v_cmpx_ge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8c,0x7d,0x01,0x77,0x39,0x05] @@ -87,10 +117,20 @@ # GFX12: v_cmpx_ge_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16 v1, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9c,0x7d,0x01,0x77,0x39,0x05] @@ -111,10 +151,20 @@ # GFX12: v_cmpx_gt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x29,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x68,0x7d,0x01,0x77,0x39,0x05] 
0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x68,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x69,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x69,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x88,0x7d,0x01,0x77,0x39,0x05] @@ -123,10 +173,20 @@ # GFX12: v_cmpx_gt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x89,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x78,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_u16 v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x78,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x79,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x79,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x98,0x7d,0x01,0x77,0x39,0x05] @@ -147,10 +207,20 @@ # GFX12: v_cmpx_le_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x27,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x66,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_cmpx_le_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x66,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x67,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x67,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x86,0x7d,0x01,0x77,0x39,0x05] @@ -159,10 +229,20 @@ # GFX12: v_cmpx_le_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x87,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x76,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x76,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x77,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x77,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x96,0x7d,0x01,0x77,0x39,0x05] @@ -205,10 +285,20 @@ # GFX12: v_cmpx_lt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x23,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x62,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16 v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x62,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_i16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x63,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x63,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x82,0x7d,0x01,0x77,0x39,0x05] @@ -217,10 +307,20 @@ # GFX12: v_cmpx_lt_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x83,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x72,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x72,0x7d,0x7f,0x00,0x00,0x00] + 
+0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x73,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x73,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x92,0x7d,0x01,0x77,0x39,0x05] @@ -229,10 +329,20 @@ # GFX12: v_cmpx_lt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x93,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_i16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_i16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_i16 v1.h, v2.h 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_i16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x6b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_i16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_i16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x6b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ne_i32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x8a,0x7d,0x01,0x77,0x39,0x05] @@ -241,10 +351,20 @@ # GFX12: v_cmpx_ne_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x8b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ne_u16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ne_u16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ne_u16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ne_u16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] +# 
GFX12-FAKE16: v_cmpx_ne_u16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x7b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ne_u16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ne_u16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x7b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x9a,0x7d,0x01,0x77,0x39,0x05] From 213e03ca1174177370715a8776a6423ee29b10ca Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 22 Jan 2025 22:00:17 +0100 Subject: [PATCH 038/208] [Clang] Fix handling of immediate escalation for inherited constructors (#112860) Fixes #112677 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Sema/Sema.h | 2 +- clang/lib/AST/Decl.cpp | 11 +++++++ .../SemaCXX/cxx2b-consteval-propagate.cpp | 32 +++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cad17c1b3957b6..c749e34d6d2c5d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -971,6 +971,7 @@ Bug Fixes to C++ Support - Fixed canonicalization of pack indexing types - Clang did not always recognized identical pack indexing. (#GH123033) - Fixed a nested lambda substitution issue for constraint evaluation. (#GH123441) - Fixed various false diagnostics related to the use of immediate functions. (#GH123472) +- Fix immediate escalation not propagating through inherited constructors. 
(#GH112677) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 9fa33d6ca76ba5..9a9998b114e0f7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13086,7 +13086,7 @@ class Sema final : public SemaBase { auto *FD = dyn_cast(DC); S.PushFunctionScope(); S.PushExpressionEvaluationContext( - (FD && FD->isConsteval()) + (FD && FD->isImmediateFunction()) ? ExpressionEvaluationContext::ImmediateFunctionContext : ExpressionEvaluationContext::PotentiallyEvaluated); if (FD) { diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index a1a51d38b93e1f..ddde16ada5af88 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3283,6 +3283,11 @@ bool FunctionDecl::isImmediateEscalating() const { // consteval specifier, if (isDefaulted() && !isConsteval()) return true; + + if (auto *CD = dyn_cast(this); + CD && CD->isInheritingConstructor()) + return CD->getInheritedConstructor().getConstructor(); + // - a function that results from the instantiation of a templated entity // defined with the constexpr specifier. 
TemplatedKind TK = getTemplatedKind(); @@ -3303,6 +3308,12 @@ bool FunctionDecl::isImmediateFunction() const { if (isImmediateEscalating() && BodyContainsImmediateEscalatingExpressions()) return true; + if (auto *CD = dyn_cast(this); + CD && CD->isInheritingConstructor()) + return CD->getInheritedConstructor() + .getConstructor() + ->isImmediateFunction(); + if (const auto *MD = dyn_cast(this); MD && MD->isLambdaStaticInvoker()) return MD->getParent()->getLambdaCallOperator()->isImmediateFunction(); diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 378414f1361729..3f3123eaee76b6 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -496,3 +496,35 @@ struct Y { template void g(); } + +namespace GH112677 { + +class ConstEval { + public: + consteval ConstEval(int); // expected-note 2{{declared here}} +}; + +struct TemplateCtor { + ConstEval val; + template constexpr + TemplateCtor(int arg) : val(arg) {} // expected-note {{undefined constructor 'ConstEval'}} +}; +struct C : TemplateCtor { + using TemplateCtor::TemplateCtor; // expected-note {{in call to 'TemplateCtor(0)'}} +}; + +C c(0); // expected-note{{in implicit initialization for inherited constructor of 'C'}} +// expected-error@-1 {{call to immediate function 'GH112677::C::TemplateCtor' is not a constant expression}} + +struct SimpleCtor { constexpr SimpleCtor(int) {}}; +struct D : SimpleCtor { + int y = 10; + ConstEval x = y; // expected-note {{undefined constructor 'ConstEval'}} + using SimpleCtor::SimpleCtor; + //expected-note@-1 {{'SimpleCtor' is an immediate constructor because the default initializer of 'x' contains a call to a consteval constructor 'ConstEval' and that call is not a constant expression}} +}; + +D d(0); // expected-note {{in implicit initialization for inherited constructor of 'D'}} +// expected-error@-1 {{call to immediate function 
'GH112677::D::SimpleCtor' is not a constant expression}} + +} From c6e7b4a61ab8718d9ac9d1d32f7d2d0cd0b19a7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 22 Jan 2025 13:09:56 -0800 Subject: [PATCH 039/208] [flang][cuda][NFC] Add kernel name in translation error (#123987) --- flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp index 0c1424d11b515c..7ed7f355959683 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp @@ -71,7 +71,8 @@ LogicalResult registerKernel(cuf::RegisterKernelOp op, llvm::Function *fctSym = moduleTranslation.lookupFunction(op.getKernelName().str()); if (!fctSym) - return op.emitError() << "Couldn't find kernel name symbol"; + return op.emitError() << "Couldn't find kernel name symbol: " + << op.getKernelName().str(); builder.CreateCall(fct, {modulePtr, fctSym, getOrCreateFunctionName( module, builder, op.getKernelModuleName().str(), From 2656928d0ca78e38c91315020876755e46ccecbf Mon Sep 17 00:00:00 2001 From: Deric Cheung Date: Wed, 22 Jan 2025 13:29:19 -0800 Subject: [PATCH 040/208] Reland "[HLSL] Implement the `reflect` HLSL function" (#123853) This PR relands [#122992](https://github.com/llvm/llvm-project/pull/122992). 
Some machines were failing to run the `reflect-error.ll` test due to the RUN lines ```llvm ; RUN: not %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 -filetype=obj %} ; RUN: not %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 -filetype=obj %} ``` which failed when `spirv-tools` was not present on the machine due to running the command `not` without any arguments. These RUN lines have been removed since they don't actually test anything new compared to the other two RUN lines due to the expected error during instruction selection. ```llvm ; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s ; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s ``` --- clang/include/clang/Basic/BuiltinsSPIRV.td | 6 + clang/lib/CodeGen/CGBuiltin.cpp | 13 ++ clang/lib/Headers/hlsl/hlsl_detail.h | 16 ++ clang/lib/Headers/hlsl/hlsl_intrinsics.h | 43 +++++ clang/lib/Sema/SemaSPIRV.cpp | 32 ++++ clang/test/CodeGenHLSL/builtins/reflect.hlsl | 177 ++++++++++++++++++ clang/test/CodeGenSPIRV/Builtins/reflect.c | 32 ++++ .../SemaHLSL/BuiltIns/reflect-errors.hlsl | 33 ++++ .../test/SemaSPIRV/BuiltIns/reflect-errors.c | 23 +++ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 16 +- .../CodeGen/SPIRV/hlsl-intrinsics/reflect.ll | 33 ++++ .../CodeGen/SPIRV/opencl/reflect-error.ll | 13 ++ 13 files changed, 434 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/reflect.hlsl create mode 100644 clang/test/CodeGenSPIRV/Builtins/reflect.c create mode 100644 clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl create mode 100644 clang/test/SemaSPIRV/BuiltIns/reflect-errors.c create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll diff --git 
a/clang/include/clang/Basic/BuiltinsSPIRV.td b/clang/include/clang/Basic/BuiltinsSPIRV.td index f72c555921dfe6..34933e889ba314 100644 --- a/clang/include/clang/Basic/BuiltinsSPIRV.td +++ b/clang/include/clang/Basic/BuiltinsSPIRV.td @@ -19,3 +19,9 @@ def SPIRVLength : Builtin { let Attributes = [NoThrow, Const]; let Prototype = "void(...)"; } + +def SPIRVReflect : Builtin { + let Spellings = ["__builtin_spirv_reflect"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f1515347fb816c..d1a533ca8d7091 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20433,6 +20433,19 @@ Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/X->getType()->getScalarType(), Intrinsic::spv_length, ArrayRef{X}, nullptr, "spv.length"); } + case SPIRV::BI__builtin_spirv_reflect: { + Value *I = EmitScalarExpr(E->getArg(0)); + Value *N = EmitScalarExpr(E->getArg(1)); + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + E->getArg(1)->getType()->hasFloatingRepresentation() && + "Reflect operands must have a float representation"); + assert(E->getArg(0)->getType()->isVectorType() && + E->getArg(1)->getType()->isVectorType() && + "Reflect operands must be a vector"); + return Builder.CreateIntrinsic( + /*ReturnType=*/I->getType(), Intrinsic::spv_reflect, + ArrayRef{I, N}, nullptr, "spv.reflect"); + } } return nullptr; } diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h index b2c8cc6c5c3dbb..0d568539cd66a8 100644 --- a/clang/lib/Headers/hlsl/hlsl_detail.h +++ b/clang/lib/Headers/hlsl/hlsl_detail.h @@ -79,6 +79,22 @@ constexpr enable_if_t::value || is_same::value, T> distance_vec_impl(vector X, vector Y) { return length_vec_impl(X - Y); } + +template +constexpr enable_if_t::value || is_same::value, T> +reflect_impl(T I, T N) { + return I - 2 * N * I * N; +} + +template +constexpr 
vector reflect_vec_impl(vector I, vector N) {
+#if (__has_builtin(__builtin_spirv_reflect))
+  return __builtin_spirv_reflect(I, N);
+#else
+  return I - 2 * N * __builtin_hlsl_dot(I, N);
+#endif
+}
+
} // namespace __detail
} // namespace hlsl
#endif //_HLSL_HLSL_DETAILS_H_
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index d1e4eb08aa7646..3b47074f07ecf4 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -2008,6 +2008,49 @@ double3 rcp(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rcp)
 double4 rcp(double4);
 
+//===----------------------------------------------------------------------===//
+// reflect builtin
+//===----------------------------------------------------------------------===//
+
+/// \fn T reflect(T I, T N)
+/// \brief Returns a reflection using an incident ray, \a I, and a surface
+/// normal, \a N.
+/// \param I The incident ray.
+/// \param N The surface normal.
+///
+/// The return value is a floating-point vector that represents the reflection
+/// of the incident ray, \a I, off a surface with the normal \a N.
+///
+/// This function calculates the reflection vector using the following formula:
+/// V = I - 2 * N * dot(I, N).
+///
+/// N must already be normalized in order to achieve the desired result.
+///
+/// The operands must all be a scalar or vector whose component type is
+/// floating-point.
+///
+/// Result type and the type of all operands must be the same type.
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline half reflect(half I, half N) { + return __detail::reflect_impl(I, N); +} + +const inline float reflect(float I, float N) { + return __detail::reflect_impl(I, N); +} + +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline vector reflect(vector I, vector N) { + return __detail::reflect_vec_impl(I, N); +} + +template +const inline vector reflect(vector I, vector N) { + return __detail::reflect_vec_impl(I, N); +} + //===----------------------------------------------------------------------===// // rsqrt builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index dc49fc79073572..94534485e07c33 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -69,6 +69,38 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(unsigned BuiltinID, TheCall->setType(RetTy); break; } + case SPIRV::BI__builtin_spirv_reflect: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + + ExprResult A = TheCall->getArg(0); + QualType ArgTyA = A.get()->getType(); + auto *VTyA = ArgTyA->getAs(); + if (VTyA == nullptr) { + SemaRef.Diag(A.get()->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << ArgTyA + << SemaRef.Context.getVectorType(ArgTyA, 2, VectorKind::Generic) << 1 + << 0 << 0; + return true; + } + + ExprResult B = TheCall->getArg(1); + QualType ArgTyB = B.get()->getType(); + auto *VTyB = ArgTyB->getAs(); + if (VTyB == nullptr) { + SemaRef.Diag(A.get()->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << ArgTyB + << SemaRef.Context.getVectorType(ArgTyB, 2, VectorKind::Generic) << 1 + << 0 << 0; + return true; + } + + QualType RetTy = ArgTyA; + TheCall->setType(RetTy); + break; + } } return false; } diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl new file mode 100644 index 00000000000000..35ee059697c4ba --- 
/dev/null +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl @@ -0,0 +1,177 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 +// CHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[N]], [[N]] +// CHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[MUL_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]] +// CHECK-NEXT: ret half [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 +// SPVCHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[N]], [[N]] +// SPVCHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[MUL_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]] +// SPVCHECK-NEXT: ret half [[SUB_I]] +// +half test_reflect_half(half I, half N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x 
half> @_Z18test_reflect_half2Dv2_DhS_( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[I]], <2 x half> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <2 x half> [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> [[I]], <2 x half> [[N]]) +// SPVCHECK-NEXT: ret <2 x half> [[SPV_REFLECT_I]] +// +half2 test_reflect_half2(half2 I, half2 N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[I]], <3 x half> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 +// 
CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x half> poison, half [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <3 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <3 x half> [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> [[I]], <3 x half> [[N]]) +// SPVCHECK-NEXT: ret <3 x half> [[SPV_REFLECT_I]] +// +half3 test_reflect_half3(half3 I, half3 N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[I]], <4 x half> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <4 x half> [[SUB_I]] +// +// SPVCHECK-LABEL: define 
spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> [[I]], <4 x half> [[N]]) +// SPVCHECK-NEXT: ret <4 x half> [[SPV_REFLECT_I]] +// +half4 test_reflect_half4(half4 I, half4 N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( +// CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 +// CHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[N]], [[N]] +// CHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[MUL_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]] +// CHECK-NEXT: ret float [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 +// SPVCHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[N]], [[N]] +// SPVCHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[MUL_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]] +// SPVCHECK-NEXT: ret float [[SUB_I]] +// +float test_reflect_float(float I, float N) { + return reflect(I, N); +} + +// CHECK-LABEL: 
define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[I]], <2 x float> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <2 x float> [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> [[I]], <2 x float> [[N]]) +// SPVCHECK-NEXT: ret <2 x float> [[SPV_REFLECT_I]] +// +float2 test_reflect_float2(float2 I, float2 N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[I]], <3 x float> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = 
fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x float> poison, float [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <3 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <3 x float> [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[I]], <3 x float> [[N]]) +// SPVCHECK-NEXT: ret <3 x float> [[SPV_REFLECT_I]] +// +float3 test_reflect_float3(float3 I, float3 N) { + return reflect(I, N); +} + +// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[I]], <4 x float> [[N]]) +// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[DOTSCALAR]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[TMP1]], [[N]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz 
arcp afn <4 x float> [[I]], [[MUL1_I]] +// CHECK-NEXT: ret <4 x float> [[SUB_I]] +// +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[I]], <4 x float> [[N]]) +// SPVCHECK-NEXT: ret <4 x float> [[SPV_REFLECT_I]] +// +float4 test_reflect_float4(float4 I, float4 N) { + return reflect(I, N); +} diff --git a/clang/test/CodeGenSPIRV/Builtins/reflect.c b/clang/test/CodeGenSPIRV/Builtins/reflect.c new file mode 100644 index 00000000000000..f51ac27a07457a --- /dev/null +++ b/clang/test/CodeGenSPIRV/Builtins/reflect.c @@ -0,0 +1,32 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -emit-llvm -o - | FileCheck %s + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define spir_func <2 x float> @test_reflect_float2( +// CHECK-SAME: <2 x float> noundef [[X:%.*]], <2 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFLECT:%.*]] = tail call <2 x float> @llvm.spv.reflect.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +// CHECK-NEXT: ret <2 x float> [[SPV_REFLECT]] +// +float2 test_reflect_float2(float2 X, float2 Y) { return __builtin_spirv_reflect(X, Y); } + +// CHECK-LABEL: define spir_func <3 x float> @test_reflect_float3( +// CHECK-SAME: <3 x float> noundef [[X:%.*]], <3 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[SPV_REFLECT:%.*]] = tail call <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[X]], <3 x float> [[Y]]) +// CHECK-NEXT: ret <3 x float> [[SPV_REFLECT]] +// +float3 test_reflect_float3(float3 X, float3 Y) { return __builtin_spirv_reflect(X, Y); } + +// CHECK-LABEL: define spir_func <4 x float> @test_reflect_float4( +// CHECK-SAME: <4 x float> noundef [[X:%.*]], <4 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFLECT:%.*]] = tail call <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[X]], <4 x float> [[Y]]) +// CHECK-NEXT: ret <4 x float> [[SPV_REFLECT]] +// +float4 test_reflect_float4(float4 X, float4 Y) { return __builtin_spirv_reflect(X, Y); } + diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl new file mode 100644 index 00000000000000..28cf992ed602bf --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify + +float test_no_second_arg(float2 p0) { + return reflect(p0); + // expected-error@-1 {{no matching function for call to 'reflect'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} +} + +float test_too_many_arg(float2 p0) { + return reflect(p0, p0, p0); + // expected-error@-1 {{no matching function for call to 'reflect'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 
arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} +} + +float test_double_inputs(double p0, double p1) { + return reflect(p0, p1); + // expected-error@-1 {{call to 'reflect' is ambiguous}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} +} + +float test_int_inputs(int p0, int p1) { + return reflect(p0, p1); + // expected-error@-1 {{call to 'reflect' is ambiguous}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} +} diff --git a/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c b/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c new file mode 100644 index 00000000000000..c93dd2ffcc9c3e --- /dev/null +++ b/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify + +typedef float float2 __attribute__((ext_vector_type(2))); + +float2 test_no_second_arg(float2 p0) { + return __builtin_spirv_reflect(p0); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_spirv_reflect(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +float test_double_scalar_inputs(double p0, double p1) { + return __builtin_spirv_reflect(p0, p1); + // expected-error@-1 {{passing 'double' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(double)))) double' (vector of 2 'double' values)}} +} + +float test_int_scalar_inputs(int p0, int p1) { + return 
__builtin_spirv_reflect(p0, p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(int)))) int' (vector of 2 'int' values)}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index be337dbccaf8a9..4da464d8010f76 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -71,6 +71,7 @@ let TargetPrefix = "spv" in { [IntrNoMem] >; def int_spv_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; + def int_spv_reflect : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_spv_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [LLVMMatchType<0>, llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index f5409c27d6ea3d..8257ad10dd8c41 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -270,10 +270,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectPhi(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - [[maybe_unused]] bool selectExtInst(Register ResVReg, - const SPIRVType *RestType, - MachineInstr &I, - GL::GLSLExtInst GLInst) const; + bool selectExtInst(Register ResVReg, const SPIRVType *RestType, + MachineInstr &I, GL::GLSLExtInst GLInst) const; bool selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, CL::OpenCLExtInst CLInst) const; bool selectExtInst(Register ResVReg, const SPIRVType *ResType, @@ -902,6 
+900,14 @@ bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, GL::GLSLExtInst GLInst) const { + if (!STI.canUseExtInstSet( + SPIRV::InstructionSet::InstructionSet::GLSL_std_450)) { + std::string DiagMsg; + raw_string_ostream OS(DiagMsg); + I.print(OS, true, false, false, false); + DiagMsg += " is only supported with the GLSL extended instruction set.\n"; + report_fatal_error(DiagMsg.c_str(), false); + } return selectExtInst(ResVReg, ResType, I, {{SPIRV::InstructionSet::GLSL_std_450, GLInst}}); } @@ -3030,6 +3036,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); + case Intrinsic::spv_reflect: + return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: return selectExtInst(ResVReg, ResType, I, CL::rsqrt, GL::InverseSqrt); case Intrinsic::spv_sign: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll new file mode 100644 index 00000000000000..18962807f84ffc --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll @@ -0,0 +1,33 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for reflect are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @reflect_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Reflect %[[#arg0]] %[[#arg1]] + %spv.reflect = call <4 x half> @llvm.spv.reflect.f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %spv.reflect +} + +define noundef <4 x float> @reflect_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Reflect %[[#arg0]] %[[#arg1]] + %spv.reflect = call <4 x float> @llvm.spv.reflect.f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %spv.reflect +} + +declare <4 x half> @llvm.spv.reflect.f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.spv.reflect.f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll b/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll new file mode 100644 index 00000000000000..3b3edc13131f58 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll @@ -0,0 +1,13 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC 
intrinsic(@llvm.spv.reflect), %{{.*}}, %{{.*}} is only supported with the GLSL extended instruction set. + +define noundef <4 x float> @reflect_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + %spv.reflect = call <4 x float> @llvm.spv.reflect.f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %spv.reflect +} + +declare <4 x float> @llvm.spv.reflect.f32(<4 x float>, <4 x float>) + From 8fb42300a02c887740825cd1b60fc4fcd8d2f933 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Wed, 22 Jan 2025 16:39:08 -0500 Subject: [PATCH 041/208] [SYCL] AST support for SYCL kernel entry point functions. (#122379) A SYCL kernel entry point function is a non-member function or a static member function declared with the `sycl_kernel_entry_point` attribute. Such functions define a pattern for an offload kernel entry point function to be generated to enable execution of a SYCL kernel on a device. A SYCL library implementation orchestrates the invocation of these functions with corresponding SYCL kernel arguments in response to calls to SYCL kernel invocation functions specified by the SYCL 2020 specification. The offload kernel entry point function (sometimes referred to as the SYCL kernel caller function) is generated from the SYCL kernel entry point function by a transformation of the function parameters followed by a transformation of the function body to replace references to the original parameters with references to the transformed ones. Exactly how parameters are transformed will be explained in a future change that implements non-trivial transformations. For now, it suffices to state that a given parameter of the SYCL kernel entry point function may be transformed to multiple parameters of the offload kernel entry point as needed to satisfy offload kernel argument passing requirements. Parameters that are decomposed in this way are reconstituted as local variables in the body of the generated offload kernel entry point function. 
For example, given the following SYCL kernel entry point function definition: ``` template <typename KernelNameType, typename KernelType> [[clang::sycl_kernel_entry_point(KernelNameType)]] void sycl_kernel_entry_point(KernelType kernel) { kernel(); } ``` and the following call: ``` struct Kernel { int dm1; int dm2; void operator()() const; }; Kernel k; sycl_kernel_entry_point(k); ``` the corresponding offload kernel entry point function that is generated might look as follows (assuming `Kernel` is a type that requires decomposition): ``` void offload_kernel_entry_point_for_kernel_name(int dm1, int dm2) { Kernel kernel{dm1, dm2}; kernel(); } ``` Other details of the generated offload kernel entry point function, such as its name and calling convention, are implementation details that need not be reflected in the AST and may differ across target devices. For that reason, only the transformation described above is represented in the AST; other details will be filled in during code generation. These transformations are represented using new AST nodes introduced with this change. `OutlinedFunctionDecl` holds a sequence of `ImplicitParamDecl` nodes and a sequence of statement nodes that correspond to the transformed parameters and function body. `SYCLKernelCallStmt` wraps the original function body and associates it with an `OutlinedFunctionDecl` instance. For the example above, the AST generated for the `sycl_kernel_entry_point` specialization would look as follows: ``` FunctionDecl 'sycl_kernel_entry_point(Kernel)' TemplateArgument type 'kernel_name' TemplateArgument type 'Kernel' ParmVarDecl kernel 'Kernel' SYCLKernelCallStmt CompoundStmt OutlinedFunctionDecl ImplicitParamDecl 'dm1' 'int' ImplicitParamDecl 'dm2' 'int' CompoundStmt VarDecl 'kernel' 'Kernel' ``` Any ODR-use of the SYCL kernel entry point function will (with future changes) suffice for the offload kernel entry point to be emitted. An actual call to the SYCL kernel entry point function will result in a call to the function.
However, evaluation of a `SYCLKernelCallStmt` statement is a no-op, so such calls will have no effect other than to trigger emission of the offload kernel entry point. Additionally, as a related change inspired by code review feedback, these changes disallow use of the `sycl_kernel_entry_point` attribute with functions defined with a _function-try-block_. The SYCL 2020 specification prohibits the use of C++ exceptions in device functions. Even if exceptions were not prohibited, it is unclear what the semantics would be for an exception that escapes the SYCL kernel entry point function; the boundary between host and device code could be an implicit noexcept boundary that results in program termination if violated, or the exception could perhaps be propagated to host code via the SYCL library. Pending support for C++ exceptions in device code and clear semantics for handling them at the host-device boundary, this change makes use of the `sycl_kernel_entry_point` attribute with a function defined with a _function-try-block_ an error. 
--- clang/include/clang/AST/ASTNodeTraverser.h | 16 +- clang/include/clang/AST/Decl.h | 77 +++++ clang/include/clang/AST/RecursiveASTVisitor.h | 14 + clang/include/clang/AST/StmtSYCL.h | 94 ++++++ clang/include/clang/AST/StmtVisitor.h | 1 + clang/include/clang/Basic/AttrDocs.td | 1 + clang/include/clang/Basic/DeclNodes.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 3 +- clang/include/clang/Basic/StmtNodes.td | 1 + clang/include/clang/Sema/SemaSYCL.h | 1 + clang/include/clang/Sema/Template.h | 5 +- .../include/clang/Serialization/ASTBitCodes.h | 6 + clang/lib/AST/ASTStructuralEquivalence.cpp | 1 + clang/lib/AST/Decl.cpp | 29 ++ clang/lib/AST/DeclBase.cpp | 4 + clang/lib/AST/Stmt.cpp | 1 + clang/lib/AST/StmtPrinter.cpp | 5 + clang/lib/AST/StmtProfile.cpp | 4 + clang/lib/CodeGen/CGDecl.cpp | 1 + clang/lib/CodeGen/CGStmt.cpp | 18 ++ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Sema/SemaDecl.cpp | 15 +- clang/lib/Sema/SemaExceptionSpec.cpp | 1 + clang/lib/Sema/SemaSYCL.cpp | 92 ++++++ clang/lib/Sema/TreeTransform.h | 16 +- clang/lib/Serialization/ASTCommon.cpp | 2 + clang/lib/Serialization/ASTReaderDecl.cpp | 13 + clang/lib/Serialization/ASTReaderStmt.cpp | 11 + clang/lib/Serialization/ASTWriterDecl.cpp | 11 + clang/lib/Serialization/ASTWriterStmt.cpp | 8 + clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 + .../ast-dump-sycl-kernel-call-stmt.cpp | 275 ++++++++++++++++++ .../ast-dump-sycl-kernel-entry-point.cpp | 6 +- ...-kernel-entry-point-attr-appertainment.cpp | 9 + clang/tools/libclang/CIndex.cpp | 1 + clang/tools/libclang/CXCursor.cpp | 4 + 36 files changed, 737 insertions(+), 12 deletions(-) create mode 100644 clang/include/clang/AST/StmtSYCL.h create mode 100644 clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index f5652b295de168..3bc0bdff2bdd12 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h 
@@ -158,8 +158,8 @@ class ASTNodeTraverser ConstStmtVisitor::Visit(S); // Some statements have custom mechanisms for dumping their children. - if (isa(S) || isa(S) || - isa(S) || isa(S)) + if (isa(S)) return; if (Traversal == TK_IgnoreUnlessSpelledInSource && @@ -585,6 +585,12 @@ class ASTNodeTraverser void VisitTopLevelStmtDecl(const TopLevelStmtDecl *D) { Visit(D->getStmt()); } + void VisitOutlinedFunctionDecl(const OutlinedFunctionDecl *D) { + for (const ImplicitParamDecl *Parameter : D->parameters()) + Visit(Parameter); + Visit(D->getBody()); + } + void VisitCapturedDecl(const CapturedDecl *D) { Visit(D->getBody()); } void VisitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) { @@ -815,6 +821,12 @@ class ASTNodeTraverser Visit(Node->getCapturedDecl()); } + void VisitSYCLKernelCallStmt(const SYCLKernelCallStmt *Node) { + Visit(Node->getOriginalStmt()); + if (Traversal != TK_IgnoreUnlessSpelledInSource) + Visit(Node->getOutlinedFunctionDecl()); + } + void VisitOMPExecutableDirective(const OMPExecutableDirective *Node) { for (const auto *C : Node->clauses()) Visit(C); diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 186a3e7fca59db..d01681483a9189 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4688,6 +4688,83 @@ class BlockDecl : public Decl, public DeclContext { } }; +/// Represents a partial function definition. +/// +/// An outlined function declaration contains the parameters and body of +/// a function independent of other function definition concerns such +/// as function name, type, and calling convention. Such declarations may +/// be used to hold a parameterized and transformed sequence of statements +/// used to generate a target dependent function definition without losing +/// association with the original statements. See SYCLKernelCallStmt as an +/// example. 
+class OutlinedFunctionDecl final + : public Decl, + public DeclContext, + private llvm::TrailingObjects { +private: + /// The number of parameters to the outlined function. + unsigned NumParams; + + /// The body of the outlined function. + llvm::PointerIntPair BodyAndNothrow; + + explicit OutlinedFunctionDecl(DeclContext *DC, unsigned NumParams); + + ImplicitParamDecl *const *getParams() const { + return getTrailingObjects(); + } + + ImplicitParamDecl **getParams() { + return getTrailingObjects(); + } + +public: + friend class ASTDeclReader; + friend class ASTDeclWriter; + friend TrailingObjects; + + static OutlinedFunctionDecl *Create(ASTContext &C, DeclContext *DC, + unsigned NumParams); + static OutlinedFunctionDecl * + CreateDeserialized(ASTContext &C, GlobalDeclID ID, unsigned NumParams); + + Stmt *getBody() const override; + void setBody(Stmt *B); + + bool isNothrow() const; + void setNothrow(bool Nothrow = true); + + unsigned getNumParams() const { return NumParams; } + + ImplicitParamDecl *getParam(unsigned i) const { + assert(i < NumParams); + return getParams()[i]; + } + void setParam(unsigned i, ImplicitParamDecl *P) { + assert(i < NumParams); + getParams()[i] = P; + } + + // Range interface to parameters. + using parameter_const_iterator = const ImplicitParamDecl *const *; + using parameter_const_range = llvm::iterator_range; + parameter_const_range parameters() const { + return {param_begin(), param_end()}; + } + parameter_const_iterator param_begin() const { return getParams(); } + parameter_const_iterator param_end() const { return getParams() + NumParams; } + + // Implement isa/cast/dyncast/etc. 
+ static bool classof(const Decl *D) { return classofKind(D->getKind()); } + static bool classofKind(Kind K) { return K == OutlinedFunction; } + static DeclContext *castToDeclContext(const OutlinedFunctionDecl *D) { + return static_cast(const_cast(D)); + } + static OutlinedFunctionDecl *castFromDeclContext(const DeclContext *DC) { + return static_cast(const_cast(DC)); + } +}; + /// Represents the body of a CapturedStmt, and serves as its DeclContext. class CapturedDecl final : public Decl, diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index d500f4eadef757..c4a1d03f1b3d10 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -37,6 +37,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/TemplateName.h" #include "clang/AST/Type.h" @@ -1581,6 +1582,11 @@ DEF_TRAVERSE_DECL(BlockDecl, { ShouldVisitChildren = false; }) +DEF_TRAVERSE_DECL(OutlinedFunctionDecl, { + TRY_TO(TraverseStmt(D->getBody())); + ShouldVisitChildren = false; +}) + DEF_TRAVERSE_DECL(CapturedDecl, { TRY_TO(TraverseStmt(D->getBody())); ShouldVisitChildren = false; @@ -2904,6 +2910,14 @@ DEF_TRAVERSE_STMT(SEHFinallyStmt, {}) DEF_TRAVERSE_STMT(SEHLeaveStmt, {}) DEF_TRAVERSE_STMT(CapturedStmt, { TRY_TO(TraverseDecl(S->getCapturedDecl())); }) +DEF_TRAVERSE_STMT(SYCLKernelCallStmt, { + if (getDerived().shouldVisitImplicitCode()) { + TRY_TO(TraverseStmt(S->getOriginalStmt())); + TRY_TO(TraverseDecl(S->getOutlinedFunctionDecl())); + ShouldVisitChildren = false; + } +}) + DEF_TRAVERSE_STMT(CXXOperatorCallExpr, {}) DEF_TRAVERSE_STMT(CXXRewrittenBinaryOperator, { if (!getDerived().shouldVisitImplicitCode()) { diff --git a/clang/include/clang/AST/StmtSYCL.h b/clang/include/clang/AST/StmtSYCL.h new file mode 100644 index 00000000000000..28ace12d7916b4 --- 
/dev/null +++ b/clang/include/clang/AST/StmtSYCL.h @@ -0,0 +1,94 @@ +//===- StmtSYCL.h - Classes for SYCL kernel calls ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines SYCL AST classes used to represent calls to SYCL kernels. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_STMTSYCL_H +#define LLVM_CLANG_AST_STMTSYCL_H + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/Stmt.h" +#include "clang/Basic/SourceLocation.h" + +namespace clang { + +//===----------------------------------------------------------------------===// +// AST classes for SYCL kernel calls. +//===----------------------------------------------------------------------===// + +/// SYCLKernelCallStmt represents the transformation that is applied to the body +/// of a function declared with the sycl_kernel_entry_point attribute. The body +/// of such a function specifies the statements to be executed on a SYCL device +/// to invoke a SYCL kernel with a particular set of kernel arguments. The +/// SYCLKernelCallStmt associates an original statement (the compound statement +/// that is the function body) with an OutlinedFunctionDecl that holds the +/// kernel parameters and the transformed body. During code generation, the +/// OutlinedFunctionDecl is used to emit an offload kernel entry point suitable +/// for invocation from a SYCL library implementation. If executed, the +/// SYCLKernelCallStmt behaves as a no-op; no code generation is performed for +/// it. 
+class SYCLKernelCallStmt : public Stmt { + friend class ASTStmtReader; + friend class ASTStmtWriter; + +private: + Stmt *OriginalStmt = nullptr; + OutlinedFunctionDecl *OFDecl = nullptr; + +public: + /// Construct a SYCL kernel call statement. + SYCLKernelCallStmt(CompoundStmt *CS, OutlinedFunctionDecl *OFD) + : Stmt(SYCLKernelCallStmtClass), OriginalStmt(CS), OFDecl(OFD) {} + + /// Construct an empty SYCL kernel call statement. + SYCLKernelCallStmt(EmptyShell Empty) : Stmt(SYCLKernelCallStmtClass, Empty) {} + + /// Retrieve the model statement. + CompoundStmt *getOriginalStmt() { return cast(OriginalStmt); } + const CompoundStmt *getOriginalStmt() const { + return cast(OriginalStmt); + } + void setOriginalStmt(CompoundStmt *CS) { OriginalStmt = CS; } + + /// Retrieve the outlined function declaration. + OutlinedFunctionDecl *getOutlinedFunctionDecl() { return OFDecl; } + const OutlinedFunctionDecl *getOutlinedFunctionDecl() const { return OFDecl; } + + /// Set the outlined function declaration. 
+ void setOutlinedFunctionDecl(OutlinedFunctionDecl *OFD) { OFDecl = OFD; } + + SourceLocation getBeginLoc() const LLVM_READONLY { + return getOriginalStmt()->getBeginLoc(); + } + + SourceLocation getEndLoc() const LLVM_READONLY { + return getOriginalStmt()->getEndLoc(); + } + + SourceRange getSourceRange() const LLVM_READONLY { + return getOriginalStmt()->getSourceRange(); + } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == SYCLKernelCallStmtClass; + } + + child_range children() { + return child_range(&OriginalStmt, &OriginalStmt + 1); + } + + const_child_range children() const { + return const_child_range(&OriginalStmt, &OriginalStmt + 1); + } +}; + +} // end namespace clang + +#endif // LLVM_CLANG_AST_STMTSYCL_H diff --git a/clang/include/clang/AST/StmtVisitor.h b/clang/include/clang/AST/StmtVisitor.h index 990aa2df180d43..8b7b728deaff27 100644 --- a/clang/include/clang/AST/StmtVisitor.h +++ b/clang/include/clang/AST/StmtVisitor.h @@ -22,6 +22,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/Basic/LLVM.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 56a817892bbbaa..a8b588169725a2 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -487,6 +487,7 @@ following requirements. * Is not a C variadic function. * Is not a coroutine. * Is not defined as deleted or as defaulted. +* Is not defined with a function try block. * Is not declared with the ``constexpr`` or ``consteval`` specifiers. * Is not declared with the ``[[noreturn]]`` attribute. 
diff --git a/clang/include/clang/Basic/DeclNodes.td b/clang/include/clang/Basic/DeclNodes.td index 48396e85c5adac..723113dc2486e0 100644 --- a/clang/include/clang/Basic/DeclNodes.td +++ b/clang/include/clang/Basic/DeclNodes.td @@ -101,6 +101,7 @@ def Friend : DeclNode; def FriendTemplate : DeclNode; def StaticAssert : DeclNode; def Block : DeclNode, DeclContext; +def OutlinedFunction : DeclNode, DeclContext; def Captured : DeclNode, DeclContext; def Import : DeclNode; def OMPThreadPrivate : DeclNode; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0175c20daf241e..36b693c6a304e7 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12457,7 +12457,8 @@ def err_sycl_entry_point_invalid : Error< "'sycl_kernel_entry_point' attribute cannot be applied to a" " %select{non-static member function|variadic function|deleted function|" "defaulted function|constexpr function|consteval function|" - "function declared with the 'noreturn' attribute|coroutine}0">; + "function declared with the 'noreturn' attribute|coroutine|" + "function defined with a function try block}0">; def err_sycl_entry_point_invalid_redeclaration : Error< "'sycl_kernel_entry_point' kernel name argument does not match prior" " declaration%diff{: $ vs $|}0,1">; diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index ce2c48bd3c84e9..53fc77bbbcecc1 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -24,6 +24,7 @@ def SwitchCase : StmtNode; def CaseStmt : StmtNode; def DefaultStmt : StmtNode; def CapturedStmt : StmtNode; +def SYCLKernelCallStmt : StmtNode; // Statements that might produce a value (for example, as the last non-null // statement in a GNU statement-expression). 
diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index 5bb0de40c886c7..b47b2f155ef93b 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -65,6 +65,7 @@ class SemaSYCL : public SemaBase { void handleKernelEntryPointAttr(Decl *D, const ParsedAttr &AL); void CheckSYCLEntryPointFunctionDecl(FunctionDecl *FD); + StmtResult BuildSYCLKernelCallStmt(FunctionDecl *FD, CompoundStmt *Body); }; } // namespace clang diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 9800f75f676aaf..4206bd50b13dd6 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -627,7 +627,10 @@ enum class TemplateSubstitutionKind : char { #define EMPTY(DERIVED, BASE) #define LIFETIMEEXTENDEDTEMPORARY(DERIVED, BASE) - // Decls which use special-case instantiation code. +// Decls which never appear inside a template. +#define OUTLINEDFUNCTION(DERIVED, BASE) + +// Decls which use special-case instantiation code. #define BLOCK(DERIVED, BASE) #define CAPTURED(DERIVED, BASE) #define IMPLICITPARAM(DERIVED, BASE) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index d568d2fd7aa301..1b56ed2c9776b5 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1316,6 +1316,9 @@ enum DeclCode { /// A BlockDecl record. DECL_BLOCK, + /// A OutlinedFunctionDecl record. + DECL_OUTLINEDFUNCTION, + /// A CapturedDecl record. DECL_CAPTURED, @@ -1600,6 +1603,9 @@ enum StmtCode { /// A CapturedStmt record. STMT_CAPTURED, + /// A SYCLKernelCallStmt record. + STMT_SYCLKERNELCALL, + /// A GCC-style AsmStmt record. 
STMT_GCCASM, diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 308551c3061510..eaf0748395268b 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -76,6 +76,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/TemplateName.h" #include "clang/AST/Type.h" diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index ddde16ada5af88..5ce03ce20d2841 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -5459,6 +5459,35 @@ BlockDecl *BlockDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { return new (C, ID) BlockDecl(nullptr, SourceLocation()); } +OutlinedFunctionDecl::OutlinedFunctionDecl(DeclContext *DC, unsigned NumParams) + : Decl(OutlinedFunction, DC, SourceLocation()), + DeclContext(OutlinedFunction), NumParams(NumParams), + BodyAndNothrow(nullptr, false) {} + +OutlinedFunctionDecl *OutlinedFunctionDecl::Create(ASTContext &C, + DeclContext *DC, + unsigned NumParams) { + return new (C, DC, additionalSizeToAlloc(NumParams)) + OutlinedFunctionDecl(DC, NumParams); +} + +OutlinedFunctionDecl * +OutlinedFunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, + unsigned NumParams) { + return new (C, ID, additionalSizeToAlloc(NumParams)) + OutlinedFunctionDecl(nullptr, NumParams); +} + +Stmt *OutlinedFunctionDecl::getBody() const { + return BodyAndNothrow.getPointer(); +} +void OutlinedFunctionDecl::setBody(Stmt *B) { BodyAndNothrow.setPointer(B); } + +bool OutlinedFunctionDecl::isNothrow() const { return BodyAndNothrow.getInt(); } +void OutlinedFunctionDecl::setNothrow(bool Nothrow) { + BodyAndNothrow.setInt(Nothrow); +} + CapturedDecl::CapturedDecl(DeclContext *DC, unsigned NumParams) : Decl(Captured, DC, SourceLocation()), DeclContext(Captured), NumParams(NumParams), ContextParam(0), 
BodyAndNothrow(nullptr, false) {} diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 2886aebdf52e9b..8506b95f761fe5 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -966,6 +966,7 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) { case PragmaDetectMismatch: case Block: case Captured: + case OutlinedFunction: case TranslationUnit: case ExternCContext: case Decomposition: @@ -1245,6 +1246,8 @@ template static Decl *getNonClosureContext(T *D) { return getNonClosureContext(BD->getParent()); if (auto *CD = dyn_cast(D)) return getNonClosureContext(CD->getParent()); + if (auto *OFD = dyn_cast(D)) + return getNonClosureContext(OFD->getParent()); return nullptr; } @@ -1437,6 +1440,7 @@ DeclContext *DeclContext::getPrimaryContext() { case Decl::TopLevelStmt: case Decl::Block: case Decl::Captured: + case Decl::OutlinedFunction: case Decl::OMPDeclareReduction: case Decl::OMPDeclareMapper: case Decl::RequiresExprBody: diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp index d6a351a78c7ba8..685c00d0cb44f8 100644 --- a/clang/lib/AST/Stmt.cpp +++ b/clang/lib/AST/Stmt.cpp @@ -25,6 +25,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/Type.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LLVM.h" diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 52bcb5135d3513..b5def6fbe525c3 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -30,6 +30,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/StmtVisitor.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" @@ -582,6 +583,10 @@ void StmtPrinter::VisitCapturedStmt(CapturedStmt *Node) { PrintStmt(Node->getCapturedDecl()->getBody()); } +void 
StmtPrinter::VisitSYCLKernelCallStmt(SYCLKernelCallStmt *Node) { + PrintStmt(Node->getOutlinedFunctionDecl()->getBody()); +} + void StmtPrinter::VisitObjCAtTryStmt(ObjCAtTryStmt *Node) { Indent() << "@try"; if (auto *TS = dyn_cast(Node->getTryBody())) { diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 0f1ebc68a4f762..85b59f714ba845 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -392,6 +392,10 @@ void StmtProfiler::VisitCapturedStmt(const CapturedStmt *S) { VisitStmt(S); } +void StmtProfiler::VisitSYCLKernelCallStmt(const SYCLKernelCallStmt *S) { + VisitStmt(S); +} + void StmtProfiler::VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) { VisitStmt(S); } diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index f85e0b2c404f92..60f67d4640370d 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -97,6 +97,7 @@ void CodeGenFunction::EmitDecl(const Decl &D) { case Decl::Friend: case Decl::FriendTemplate: case Decl::Block: + case Decl::OutlinedFunction: case Decl::Captured: case Decl::UsingShadow: case Decl::ConstructorUsingShadow: diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 10d44e9c90b27e..7c944fe85a352d 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -114,6 +114,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::DefaultStmtClass: case Stmt::CaseStmtClass: case Stmt::SEHLeaveStmtClass: + case Stmt::SYCLKernelCallStmtClass: llvm_unreachable("should have emitted these statements as simple"); #define STMT(Type, Base) @@ -528,6 +529,23 @@ bool CodeGenFunction::EmitSimpleStmt(const Stmt *S, case Stmt::SEHLeaveStmtClass: EmitSEHLeaveStmt(cast(*S)); break; + case Stmt::SYCLKernelCallStmtClass: + // SYCL kernel call statements are generated as wrappers around the body + // of functions declared with the sycl_kernel_entry_point attribute. 
Such + // functions are used to specify how a SYCL kernel (a function object) is + // to be invoked; the SYCL kernel call statement contains a transformed + // variation of the function body and is used to generate a SYCL kernel + // caller function; a function that serves as the device side entry point + // used to execute the SYCL kernel. The sycl_kernel_entry_point attributed + // function is invoked by host code in order to trigger emission of the + // device side SYCL kernel caller function and to generate metadata needed + // by SYCL run-time library implementations; the function is otherwise + // intended to have no effect. As such, the function body is not evaluated + // as part of the invocation during host compilation (and the function + // should not be called or emitted during device compilation); the SYCL + // kernel call statement is thus handled as a null statement for the + // purpose of code generation. + break; } return true; } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index ba8ed040477cc9..fab27d4c22ed80 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -28,6 +28,7 @@ #include "clang/AST/ExprOpenMP.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/Type.h" #include "clang/Basic/ABI.h" #include "clang/Basic/CapturedStmt.h" diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 4b56a4dea05e5c..ad49eac66e98e5 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -15966,7 +15966,8 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, CheckCoroutineWrapper(FD); } - // Diagnose invalid SYCL kernel entry point function declarations. + // Diagnose invalid SYCL kernel entry point function declarations + // and build SYCLKernelCallStmts for valid ones. 
if (FD && !FD->isInvalidDecl() && FD->hasAttr()) { SYCLKernelEntryPointAttr *SKEPAttr = FD->getAttr(); @@ -15982,6 +15983,18 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, Diag(SKEPAttr->getLocation(), diag::err_sycl_entry_point_invalid) << /*coroutine*/ 7; SKEPAttr->setInvalidAttr(); + } else if (Body && isa(Body)) { + Diag(SKEPAttr->getLocation(), diag::err_sycl_entry_point_invalid) + << /*function defined with a function try block*/ 8; + SKEPAttr->setInvalidAttr(); + } + + if (Body && !FD->isTemplated() && !SKEPAttr->isInvalidAttr()) { + StmtResult SR = + SYCL().BuildSYCLKernelCallStmt(FD, cast(Body)); + if (SR.isInvalid()) + return nullptr; + Body = SR.get(); } } diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 254ad05c5ba74a..470d0d753b5580 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1427,6 +1427,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::AttributedStmtClass: case Stmt::BreakStmtClass: case Stmt::CapturedStmtClass: + case Stmt::SYCLKernelCallStmtClass: case Stmt::CaseStmtClass: case Stmt::CompoundStmtClass: case Stmt::ContinueStmtClass: diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index ce53990fdcb18f..5efbd66c66f8d4 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -9,8 +9,10 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaSYCL.h" +#include "TreeTransform.h" #include "clang/AST/Mangle.h" #include "clang/AST/SYCLKernelInfo.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/TypeOrdering.h" #include "clang/Basic/Diagnostic.h" #include "clang/Sema/Attr.h" @@ -362,3 +364,93 @@ void SemaSYCL::CheckSYCLEntryPointFunctionDecl(FunctionDecl *FD) { } } } + +namespace { + +// The body of a function declared with the [[sycl_kernel_entry_point]] +// attribute is cloned and transformed to substitute references to the 
original +// function parameters with references to replacement variables that stand in +// for SYCL kernel parameters or local variables that reconstitute a decomposed +// SYCL kernel argument. +class OutlinedFunctionDeclBodyInstantiator + : public TreeTransform { +public: + using ParmDeclMap = llvm::DenseMap; + + OutlinedFunctionDeclBodyInstantiator(Sema &S, ParmDeclMap &M) + : TreeTransform(S), SemaRef(S), + MapRef(M) {} + + // A new set of AST nodes is always required. + bool AlwaysRebuild() { return true; } + + // Transform ParmVarDecl references to the supplied replacement variables. + ExprResult TransformDeclRefExpr(DeclRefExpr *DRE) { + const ParmVarDecl *PVD = dyn_cast(DRE->getDecl()); + if (PVD) { + ParmDeclMap::iterator I = MapRef.find(PVD); + if (I != MapRef.end()) { + VarDecl *VD = I->second; + assert(SemaRef.getASTContext().hasSameUnqualifiedType(PVD->getType(), + VD->getType())); + assert(!VD->getType().isMoreQualifiedThan(PVD->getType(), + SemaRef.getASTContext())); + VD->setIsUsed(); + return DeclRefExpr::Create( + SemaRef.getASTContext(), DRE->getQualifierLoc(), + DRE->getTemplateKeywordLoc(), VD, false, DRE->getNameInfo(), + DRE->getType(), DRE->getValueKind()); + } + } + return DRE; + } + +private: + Sema &SemaRef; + ParmDeclMap &MapRef; +}; + +} // unnamed namespace + +StmtResult SemaSYCL::BuildSYCLKernelCallStmt(FunctionDecl *FD, + CompoundStmt *Body) { + assert(!FD->isInvalidDecl()); + assert(!FD->isTemplated()); + assert(FD->hasPrototype()); + + const auto *SKEPAttr = FD->getAttr(); + assert(SKEPAttr && "Missing sycl_kernel_entry_point attribute"); + assert(!SKEPAttr->isInvalidAttr() && + "sycl_kernel_entry_point attribute is invalid"); + + // Ensure that the kernel name was previously registered and that the + // stored declaration matches. 
+ const SYCLKernelInfo &SKI = + getASTContext().getSYCLKernelInfo(SKEPAttr->getKernelName()); + assert(declaresSameEntity(SKI.getKernelEntryPointDecl(), FD) && + "SYCL kernel name conflict"); + + using ParmDeclMap = OutlinedFunctionDeclBodyInstantiator::ParmDeclMap; + ParmDeclMap ParmMap; + + assert(SemaRef.CurContext == FD); + OutlinedFunctionDecl *OFD = + OutlinedFunctionDecl::Create(getASTContext(), FD, FD->getNumParams()); + unsigned i = 0; + for (ParmVarDecl *PVD : FD->parameters()) { + ImplicitParamDecl *IPD = ImplicitParamDecl::Create( + getASTContext(), OFD, SourceLocation(), PVD->getIdentifier(), + PVD->getType(), ImplicitParamKind::Other); + OFD->setParam(i, IPD); + ParmMap[PVD] = IPD; + ++i; + } + + OutlinedFunctionDeclBodyInstantiator OFDBodyInstantiator(SemaRef, ParmMap); + Stmt *OFDBody = OFDBodyInstantiator.TransformStmt(Body).get(); + OFD->setBody(OFDBody); + OFD->setNothrow(); + Stmt *NewBody = new (getASTContext()) SYCLKernelCallStmt(Body, OFD); + + return NewBody; +} diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 7dc88a1ae23b98..12680843a434a0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -29,6 +29,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/Basic/DiagnosticParse.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Sema/Designator.h" @@ -17057,10 +17058,9 @@ QualType TreeTransform::RebuildDependentSizedMatrixType( AttributeLoc); } -template +template QualType TreeTransform::RebuildFunctionProtoType( - QualType T, - MutableArrayRef ParamTypes, + QualType T, MutableArrayRef ParamTypes, const FunctionProtoType::ExtProtoInfo &EPI) { return SemaRef.BuildFunctionType(T, ParamTypes, getDerived().getBaseLocation(), @@ -17416,6 +17416,16 @@ TreeTransform::TransformCapturedStmt(CapturedStmt *S) { return getSema().ActOnCapturedRegionEnd(Body.get()); } +template +StmtResult 
+TreeTransform::TransformSYCLKernelCallStmt(SYCLKernelCallStmt *S) { + // SYCLKernelCallStmt nodes are inserted upon completion of a (non-template) + // function definition or instantiation of a function template specialization + // and will therefore never appear in a dependent context. + llvm_unreachable("SYCL kernel call statement cannot appear in dependent " + "context"); +} + template ExprResult TreeTransform::TransformHLSLOutArgExpr(HLSLOutArgExpr *E) { // We can transform the base expression and allow argument resolution to fill diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp index ec18e84255ca8e..3a62c4ea5595be 100644 --- a/clang/lib/Serialization/ASTCommon.cpp +++ b/clang/lib/Serialization/ASTCommon.cpp @@ -338,6 +338,7 @@ serialization::getDefinitiveDeclContext(const DeclContext *DC) { case Decl::CXXConversion: case Decl::ObjCMethod: case Decl::Block: + case Decl::OutlinedFunction: case Decl::Captured: // Objective C categories, category implementations, and class // implementations can only be defined in one place. 
@@ -439,6 +440,7 @@ bool serialization::isRedeclarableDeclKind(unsigned Kind) { case Decl::FriendTemplate: case Decl::StaticAssert: case Decl::Block: + case Decl::OutlinedFunction: case Decl::Captured: case Decl::Import: case Decl::OMPThreadPrivate: diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 72191395ec8067..0b75468a94103f 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -409,6 +409,7 @@ class ASTDeclReader : public DeclVisitor { void VisitFriendTemplateDecl(FriendTemplateDecl *D); void VisitStaticAssertDecl(StaticAssertDecl *D); void VisitBlockDecl(BlockDecl *BD); + void VisitOutlinedFunctionDecl(OutlinedFunctionDecl *D); void VisitCapturedDecl(CapturedDecl *CD); void VisitEmptyDecl(EmptyDecl *D); void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); @@ -1795,6 +1796,15 @@ void ASTDeclReader::VisitBlockDecl(BlockDecl *BD) { BD->setCaptures(Reader.getContext(), captures, capturesCXXThis); } +void ASTDeclReader::VisitOutlinedFunctionDecl(OutlinedFunctionDecl *D) { + // NumParams is deserialized by OutlinedFunctionDecl::CreateDeserialized(). 
+ VisitDecl(D); + for (unsigned I = 0; I < D->NumParams; ++I) + D->setParam(I, readDeclAs()); + D->setNothrow(Record.readInt() != 0); + D->setBody(cast_or_null(Record.readStmt())); +} + void ASTDeclReader::VisitCapturedDecl(CapturedDecl *CD) { VisitDecl(CD); unsigned ContextParamPos = Record.readInt(); @@ -4104,6 +4114,9 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_TEMPLATE_PARAM_OBJECT: D = TemplateParamObjectDecl::CreateDeserialized(Context, ID); break; + case DECL_OUTLINEDFUNCTION: + D = OutlinedFunctionDecl::CreateDeserialized(Context, ID, Record.readInt()); + break; case DECL_CAPTURED: D = CapturedDecl::CreateDeserialized(Context, ID, Record.readInt()); break; diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 4766f34e9f3a82..990235a310d902 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -33,6 +33,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtSYCL.h" #include "clang/AST/StmtVisitor.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" @@ -528,6 +529,12 @@ void ASTStmtReader::VisitCapturedStmt(CapturedStmt *S) { } } +void ASTStmtReader::VisitSYCLKernelCallStmt(SYCLKernelCallStmt *S) { + VisitStmt(S); + S->setOriginalStmt(cast(Record.readSubStmt())); + S->setOutlinedFunctionDecl(readDeclAs()); +} + void ASTStmtReader::VisitExpr(Expr *E) { VisitStmt(E); CurrentUnpackingBits.emplace(Record.readInt()); @@ -3112,6 +3119,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Context, Record[ASTStmtReader::NumStmtFields]); break; + case STMT_SYCLKERNELCALL: + S = new (Context) SYCLKernelCallStmt(Empty); + break; + case EXPR_CONSTANT: S = ConstantExpr::CreateEmpty( Context, static_cast( diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 30b28057f4c10f..54570dedb0b227 100644 --- 
a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -132,6 +132,7 @@ namespace clang { void VisitFriendTemplateDecl(FriendTemplateDecl *D); void VisitStaticAssertDecl(StaticAssertDecl *D); void VisitBlockDecl(BlockDecl *D); + void VisitOutlinedFunctionDecl(OutlinedFunctionDecl *D); void VisitCapturedDecl(CapturedDecl *D); void VisitEmptyDecl(EmptyDecl *D); void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); @@ -1377,6 +1378,16 @@ void ASTDeclWriter::VisitBlockDecl(BlockDecl *D) { Code = serialization::DECL_BLOCK; } +void ASTDeclWriter::VisitOutlinedFunctionDecl(OutlinedFunctionDecl *D) { + Record.push_back(D->getNumParams()); + VisitDecl(D); + for (unsigned I = 0; I < D->getNumParams(); ++I) + Record.AddDeclRef(D->getParam(I)); + Record.push_back(D->isNothrow() ? 1 : 0); + Record.AddStmt(D->getBody()); + Code = serialization::DECL_OUTLINEDFUNCTION; +} + void ASTDeclWriter::VisitCapturedDecl(CapturedDecl *CD) { Record.push_back(CD->getNumParams()); VisitDecl(CD); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 7eedf7da7d3fc8..651553244812f2 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -609,6 +609,14 @@ void ASTStmtWriter::VisitCapturedStmt(CapturedStmt *S) { Code = serialization::STMT_CAPTURED; } +void ASTStmtWriter::VisitSYCLKernelCallStmt(SYCLKernelCallStmt *S) { + VisitStmt(S); + Record.AddStmt(S->getOriginalStmt()); + Record.AddDeclRef(S->getOutlinedFunctionDecl()); + + Code = serialization::STMT_SYCLKERNELCALL; +} + void ASTStmtWriter::VisitExpr(Expr *E) { VisitStmt(E); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index ff8bdcea9a2201..140c77790496d9 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1822,6 +1822,7 @@ void ExprEngine::Visit(const Stmt *S, 
ExplodedNode *Pred, case Stmt::OMPParallelGenericLoopDirectiveClass: case Stmt::OMPTargetParallelGenericLoopDirectiveClass: case Stmt::CapturedStmtClass: + case Stmt::SYCLKernelCallStmtClass: case Stmt::OpenACCComputeConstructClass: case Stmt::OpenACCLoopConstructClass: case Stmt::OpenACCCombinedConstructClass: diff --git a/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp b/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp new file mode 100644 index 00000000000000..27604e237adbb1 --- /dev/null +++ b/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp @@ -0,0 +1,275 @@ +// Tests without serialization: +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \ +// RUN: -ast-dump %s \ +// RUN: | FileCheck --match-full-lines %s +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \ +// RUN: -ast-dump %s \ +// RUN: | FileCheck --match-full-lines %s +// +// Tests with serialization: +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \ +// RUN: -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \ +// RUN: -include-pch %t -ast-dump-all /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck --match-full-lines %s +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \ +// RUN: -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \ +// RUN: -include-pch %t -ast-dump-all /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck --match-full-lines %s + +// These tests validate the AST body produced for functions declared with the +// sycl_kernel_entry_point attribute. + +// CHECK: TranslationUnitDecl {{.*}} + +// A unique kernel name type is required for each declared kernel entry point. +template struct KN; + +// A unique invocable type for use with each declared kernel entry point. 
+template struct K { + template + void operator()(Ts...) const {} +}; + + +[[clang::sycl_kernel_entry_point(KN<1>)]] +void skep1() { +} +// CHECK: |-FunctionDecl {{.*}} skep1 'void ()' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<1> + +template +[[clang::sycl_kernel_entry_point(KNT)]] +void skep2(KT k) { + k(); +} +template +void skep2>(K<2>); +// CHECK: |-FunctionTemplateDecl {{.*}} skep2 +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KNT +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KT +// CHECK-NEXT: | |-FunctionDecl {{.*}} skep2 'void (KT)' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} k 'KT' +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'KT' lvalue ParmVar {{.*}} 'k' 'KT' +// CHECK-NEXT: | | `-SYCLKernelEntryPointAttr {{.*}} KNT + +// CHECK-NEXT: | `-FunctionDecl {{.*}} skep2 'void (K<2>)' explicit_instantiation_definition +// CHECK-NEXT: | |-TemplateArgument type 'KN<2>' +// CHECK-NEXT: | | `-RecordType {{.*}} 'KN<2>' +// CHECK-NEXT: | | `-ClassTemplateSpecialization {{.*}} 'KN' +// CHECK-NEXT: | |-TemplateArgument type 'K<2>' +// CHECK-NEXT: | | `-RecordType {{.*}} 'K<2>' +// CHECK-NEXT: | | `-ClassTemplateSpecialization {{.*}} 'K' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} k 'K<2>' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | | `-ImplicitCastExpr {{.*}} 'const K<2>' lvalue +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'K<2>' lvalue ParmVar {{.*}} 'k' 'K<2>' +// CHECK-NEXT: | | 
`-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'K<2>' +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} 'const K<2>' lvalue +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'K<2>' lvalue ImplicitParam {{.*}} 'k' 'K<2>' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<2> + +template +[[clang::sycl_kernel_entry_point(KNT)]] +void skep3(KT k) { + k(); +} +template<> +[[clang::sycl_kernel_entry_point(KN<3>)]] +void skep3>(K<3> k) { + k(); +} +// CHECK: |-FunctionTemplateDecl {{.*}} skep3 +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KNT +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KT +// CHECK-NEXT: | |-FunctionDecl {{.*}} skep3 'void (KT)' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} k 'KT' +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'KT' lvalue ParmVar {{.*}} 'k' 'KT' +// CHECK-NEXT: | | `-SYCLKernelEntryPointAttr {{.*}} KNT + +// CHECK-NEXT: | `-Function {{.*}} 'skep3' 'void (K<3>)' +// CHECK-NEXT: |-FunctionDecl {{.*}} skep3 'void (K<3>)' explicit_specialization +// CHECK-NEXT: | |-TemplateArgument type 'KN<3>' +// CHECK-NEXT: | | `-RecordType {{.*}} 'KN<3>' +// CHECK-NEXT: | | `-ClassTemplateSpecialization {{.*}} 'KN' +// CHECK-NEXT: | |-TemplateArgument type 'K<3>' +// CHECK-NEXT: | | `-RecordType {{.*}} 'K<3>' +// CHECK-NEXT: | | `-ClassTemplateSpecialization {{.*}} 'K' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} k 'K<3>' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | | 
`-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | | `-ImplicitCastExpr {{.*}} 'const K<3>' lvalue +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'K<3>' lvalue ParmVar {{.*}} 'k' 'K<3>' +// CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'K<3>' +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} 'const K<3>' lvalue +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'K<3>' lvalue ImplicitParam {{.*}} 'k' 'K<3>' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<3> + +[[clang::sycl_kernel_entry_point(KN<4>)]] +void skep4(K<4> k, int p1, int p2) { + k(p1, p2); +} +// CHECK: |-FunctionDecl {{.*}} skep4 'void (K<4>, int, int)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} k 'K<4>' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} p1 'int' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} p2 'int' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'void (*)(int, int) const' +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'void (int, int) const' lvalue CXXMethod {{.*}} 'operator()' 'void (int, int) const' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'const K<4>' lvalue +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'K<4>' lvalue ParmVar {{.*}} 'k' 'K<4>' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'p1' 'int' +// CHECK-NEXT: | | | `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'p2' 'int' +// CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} +// 
CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'K<4>' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used p1 'int' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used p2 'int' +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)(int, int) const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void (int, int) const' lvalue CXXMethod {{.*}} 'operator()' 'void (int, int) const' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'const K<4>' lvalue +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'K<4>' lvalue ImplicitParam {{.*}} 'k' 'K<4>' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'int' lvalue ImplicitParam {{.*}} 'p1' 'int' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'int' lvalue ImplicitParam {{.*}} 'p2' 'int' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<4> + +[[clang::sycl_kernel_entry_point(KN<5>)]] +void skep5(int unused1, K<5> k, int unused2, int p, int unused3) { + static int slv = 0; + int lv = 4; + k(slv, 1, p, 3, lv, 5, []{ return 6; }); +} +// CHECK: |-FunctionDecl {{.*}} skep5 'void (int, K<5>, int, int, int)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} unused1 'int' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} used k 'K<5>' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} unused2 'int' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} used p 'int' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} unused3 'int' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK: | | `-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit unused1 'int' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'K<5>' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit unused2 'int' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used p 'int' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} 
implicit unused3 'int' +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | | |-DeclStmt {{.*}} +// CHECK-NEXT: | | | `-VarDecl {{.*}} used slv 'int' static cinit +// CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 +// CHECK-NEXT: | | |-DeclStmt {{.*}} +// CHECK-NEXT: | | | `-VarDecl {{.*}} used lv 'int' cinit +// CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)(int, int, int, int, int, int, (lambda {{.*}}) const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void (int, int, int, int, int, int, (lambda {{.*}})) const' lvalue CXXMethod {{.*}} 'operator()' 'void (int, int, int, int, int, int, (lambda {{.*}})) const' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'const K<5>' lvalue +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'K<5>' lvalue ImplicitParam {{.*}} 'k' 'K<5>' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'slv' 'int' +// CHECK-NEXT: | | |-IntegerLiteral {{.*}} 'int' 1 +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'int' lvalue ImplicitParam {{.*}} 'p' 'int' +// CHECK-NEXT: | | |-IntegerLiteral {{.*}} 'int' 3 +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'lv' 'int' +// CHECK-NEXT: | | |-IntegerLiteral {{.*}} 'int' 5 +// CHECK-NEXT: | | `-LambdaExpr {{.*}} '(lambda {{.*}})' +// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<5> + +struct S6 { + void operator()() const; +}; +[[clang::sycl_kernel_entry_point(KN<6>)]] +void skep6(const S6 &k) { + k(); +} +// CHECK: |-FunctionDecl {{.*}} skep6 'void (const S6 &)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} used k 'const S6 &' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | | 
|-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'const S6' lvalue ParmVar {{.*}} 'k' 'const S6 &' +// CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'const S6 &' +// CHECK-NEXT: | | `-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'const S6' lvalue ImplicitParam {{.*}} 'k' 'const S6 &' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6> + +// Parameter types are not required to be complete at the point of a +// non-defining declaration. +struct S7; +[[clang::sycl_kernel_entry_point(KN<7>)]] +void skep7(S7 k); +struct S7 { + void operator()() const; +}; +[[clang::sycl_kernel_entry_point(KN<7>)]] +void skep7(S7 k) { + k(); +} +// CHECK: |-FunctionDecl {{.*}} skep7 'void (S7)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} k 'S7' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<7> +// CHECK: |-FunctionDecl {{.*}} prev {{.*}} skep7 'void (S7)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} used k 'S7' +// CHECK-NEXT: | |-SYCLKernelCallStmt {{.*}} +// CHECK-NEXT: | | |-CompoundStmt {{.*}} +// CHECK-NEXT: | | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | | `-ImplicitCastExpr {{.*}} 'const S7' lvalue +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'S7' lvalue ParmVar {{.*}} 'k' 'S7' +// CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'S7' +// CHECK-NEXT: | | 
`-CompoundStmt {{.*}} +// CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' +// CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)() const' +// CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} 'const S7' lvalue +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'S7' lvalue ImplicitParam {{.*}} 'k' 'S7' +// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<7> + + +void the_end() {} +// CHECK: `-FunctionDecl {{.*}} the_end 'void ()' diff --git a/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp b/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp index 0189cf0402d3a3..b112e9e1db8505 100644 --- a/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp +++ b/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp @@ -143,16 +143,14 @@ void skep6() { // CHECK: |-FunctionDecl {{.*}} skep6 'void ()' // CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6> // CHECK-NEXT: |-FunctionDecl {{.*}} prev {{.*}} skep6 'void ()' -// CHECK-NEXT: | |-CompoundStmt {{.*}} -// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6> +// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<6> // Ensure that matching attributes from the same declaration are ok. 
[[clang::sycl_kernel_entry_point(KN<7>), clang::sycl_kernel_entry_point(KN<7>)]] void skep7() { } // CHECK: |-FunctionDecl {{.*}} skep7 'void ()' -// CHECK-NEXT: | |-CompoundStmt {{.*}} -// CHECK-NEXT: | |-SYCLKernelEntryPointAttr {{.*}} KN<7> +// CHECK: | |-SYCLKernelEntryPointAttr {{.*}} KN<7> // CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<7> void the_end() {} diff --git a/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-appertainment.cpp b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-appertainment.cpp index a87af7ca298ac3..d06b40c5bf02d2 100644 --- a/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-appertainment.cpp +++ b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-appertainment.cpp @@ -350,3 +350,12 @@ auto bad36 = [] [[clang::sycl_kernel_entry_point(BADKN<36>)]] static {}; // expected-error@+1 {{'sycl_kernel_entry_point' attribute cannot be applied to a coroutine}} auto bad37 = [] [[clang::sycl_kernel_entry_point(BADKN<37>)]] static -> void { co_return; }; #endif + +// expected-error@+1 {{'sycl_kernel_entry_point' attribute cannot be applied to a function defined with a function try block}} +[[clang::sycl_kernel_entry_point(BADKN<38>)]] +void bad38() try {} catch(...) {} + +// expected-error@+2 {{'sycl_kernel_entry_point' attribute cannot be applied to a function defined with a function try block}} +template +[[clang::sycl_kernel_entry_point(BADKN<39>)]] +void bad39() try {} catch(...) {} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index e175aab4499fff..42f095fea2db26 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -7202,6 +7202,7 @@ CXCursor clang_getCursorDefinition(CXCursor C) { case Decl::TopLevelStmt: case Decl::StaticAssert: case Decl::Block: + case Decl::OutlinedFunction: case Decl::Captured: case Decl::OMPCapturedExpr: case Decl::Label: // FIXME: Is this right?? 
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index ee276d8e4e1481..b9f0b089e41b00 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -375,6 +375,10 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, K = CXCursor_UnexposedStmt; break; + case Stmt::SYCLKernelCallStmtClass: + K = CXCursor_UnexposedStmt; + break; + case Stmt::IntegerLiteralClass: K = CXCursor_IntegerLiteral; break; From 517334bdb83deaae3be6fbc4fa5f1d721b01c0f0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Jan 2025 13:41:42 -0800 Subject: [PATCH 042/208] [TableGen] Add maps from Write/ReadType to the parent WriteRes/ReadAdvance. NFC (#123876) Use this to improve performance of SubtargetEmitter::findWriteResources and SubtargetEmitter::findReadAdvance. Now we can do a map lookup instead of a linear search through all WriteRes/ReadAdvance records. This reduces the build time of RISCVGenSubtargetInfo.inc on my machine from 43 seconds to 10 seconds. 
--- .../utils/TableGen/Common/CodeGenSchedule.cpp | 16 +++++ llvm/utils/TableGen/Common/CodeGenSchedule.h | 6 ++ llvm/utils/TableGen/SubtargetEmitter.cpp | 63 +++++++++---------- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 8eaba05e65ce92..2a42262f865cb9 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -2112,6 +2112,14 @@ void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef, return; WRDefs.push_back(ProcWriteResDef); + if (ProcWriteResDef->isSubClassOf("WriteRes")) { + auto &WRMap = ProcModels[PIdx].WriteResMap; + const Record *WRDef = ProcWriteResDef->getValueAsDef("WriteType"); + if (!WRMap.try_emplace(WRDef, ProcWriteResDef).second) + PrintFatalError(ProcWriteResDef->getLoc(), + "WriteType already used in another WriteRes"); + } + // Visit ProcResourceKinds referenced by the newly discovered WriteRes. 
for (const Record *ProcResDef : ProcWriteResDef->getValueAsListOfDefs("ProcResources")) { @@ -2135,6 +2143,14 @@ void CodeGenSchedModels::addReadAdvance(const Record *ProcReadAdvanceDef, if (is_contained(RADefs, ProcReadAdvanceDef)) return; RADefs.push_back(ProcReadAdvanceDef); + + if (ProcReadAdvanceDef->isSubClassOf("ReadAdvance")) { + auto &RAMap = ProcModels[PIdx].ReadAdvanceMap; + const Record *RADef = ProcReadAdvanceDef->getValueAsDef("ReadType"); + if (!RAMap.try_emplace(RADef, ProcReadAdvanceDef).second) + PrintFatalError(ProcReadAdvanceDef->getLoc(), + "ReadType already used in another ReadAdvance"); + } } unsigned CodeGenProcModel::getProcResourceIdx(const Record *PRDef) const { diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h index fed8b3e1ccb8a6..467b77e8acba31 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.h +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h @@ -244,6 +244,12 @@ struct CodeGenProcModel { ConstRecVec WriteResDefs; ConstRecVec ReadAdvanceDefs; + // Map from the WriteType field to the parent WriteRes record. + DenseMap WriteResMap; + + // Map from the ReadType field to the parent ReadAdvance record. + DenseMap ReadAdvanceMap; + // Per-operand machine model resources associated with this processor. ConstRecVec ProcResourceDefs; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 1120f06875c778..3db3ae65cc5557 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -943,24 +943,23 @@ SubtargetEmitter::findWriteResources(const CodeGenSchedRW &SchedWrite, // Check this processor's list of write resources. 
const Record *ResDef = nullptr; - for (const Record *WR : ProcModel.WriteResDefs) { - if (!WR->isSubClassOf("WriteRes")) - continue; - const Record *WRDef = WR->getValueAsDef("WriteType"); - if (AliasDef == WRDef || SchedWrite.TheDef == WRDef) { - if (ResDef) { - PrintFatalError(WR->getLoc(), "Resources are defined for both " - "SchedWrite and its alias on processor " + - ProcModel.ModelName); - } - ResDef = WR; - // If there is no AliasDef and we find a match, we can early exit since - // there is no need to verify whether there are resources defined for both - // SchedWrite and its alias. - if (!AliasDef) - break; + + auto I = ProcModel.WriteResMap.find(SchedWrite.TheDef); + if (I != ProcModel.WriteResMap.end()) + ResDef = I->second; + + if (AliasDef) { + I = ProcModel.WriteResMap.find(AliasDef); + if (I != ProcModel.WriteResMap.end()) { + if (ResDef) + PrintFatalError(I->second->getLoc(), + "Resources are defined for both SchedWrite and its " + "alias on processor " + + ProcModel.ModelName); + ResDef = I->second; } } + // TODO: If ProcModel has a base model (previous generation processor), // then call FindWriteResources recursively with that model here. if (!ResDef) { @@ -1003,24 +1002,24 @@ SubtargetEmitter::findReadAdvance(const CodeGenSchedRW &SchedRead, // Check this processor's ReadAdvanceList. const Record *ResDef = nullptr; - for (const Record *RA : ProcModel.ReadAdvanceDefs) { - if (!RA->isSubClassOf("ReadAdvance")) - continue; - const Record *RADef = RA->getValueAsDef("ReadType"); - if (AliasDef == RADef || SchedRead.TheDef == RADef) { - if (ResDef) { - PrintFatalError(RA->getLoc(), "Resources are defined for both " - "SchedRead and its alias on processor " + - ProcModel.ModelName); - } - ResDef = RA; - // If there is no AliasDef and we find a match, we can early exit since - // there is no need to verify whether there are resources defined for both - // SchedRead and its alias. 
- if (!AliasDef) - break; + + auto I = ProcModel.ReadAdvanceMap.find(SchedRead.TheDef); + if (I != ProcModel.ReadAdvanceMap.end()) + ResDef = I->second; + + if (AliasDef) { + I = ProcModel.ReadAdvanceMap.find(AliasDef); + if (I != ProcModel.ReadAdvanceMap.end()) { + if (ResDef) + PrintFatalError( + I->second->getLoc(), + "Resources are defined for both SchedRead and its alias on " + "processor " + + ProcModel.ModelName); + ResDef = I->second; } } + // TODO: If ProcModel has a base model (previous generation processor), // then call FindReadAdvance recursively with that model here. if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") { From ac94fade6075fec89eb29c7dedf01ef59601e61d Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 22 Jan 2025 23:16:49 +0100 Subject: [PATCH 043/208] [SPIR-V] Rename internal command line flags for optimization level and mtriple used when passing options into the translate API call (#123975) Rename internal command line flags for optimization level and mtriple used when passing options into the translate API call. --- llvm/lib/Target/SPIRV/SPIRVAPI.cpp | 13 +++++++------ llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp index 4c806fd7c98882..95c9b0e5200608 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp @@ -42,9 +42,10 @@ namespace { // Mimic limited number of command line flags from llc to provide a better // user experience when passing options into the translate API call. -static cl::opt SpvOptLevel(" O", cl::Hidden, cl::Prefix, cl::init('0')); -static cl::opt SpvTargetTriple(" mtriple", cl::Hidden, - cl::init("")); +static cl::opt SpirvOptLevel("spirv-O", cl::Hidden, cl::Prefix, + cl::init('0')); +static cl::opt SpirvTargetTriple("spirv-mtriple", cl::Hidden, + cl::init("")); // Utility to accept options in a command line style. 
void parseSPIRVCommandLineOptions(const std::vector &Options, @@ -94,7 +95,7 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg, } llvm::CodeGenOptLevel OLevel; - if (auto Level = CodeGenOpt::parseLevel(SpvOptLevel)) { + if (auto Level = CodeGenOpt::parseLevel(SpirvOptLevel)) { OLevel = *Level; } else { ErrMsg = "Invalid optimization level!"; @@ -115,9 +116,9 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg, // SPIR-V-specific target initialization. InitializeSPIRVTarget(); - Triple TargetTriple(SpvTargetTriple.empty() + Triple TargetTriple(SpirvTargetTriple.empty() ? M->getTargetTriple() - : Triple::normalize(SpvTargetTriple)); + : Triple::normalize(SpirvTargetTriple)); if (TargetTriple.getTriple().empty()) { TargetTriple.setTriple(DefaultTriple); M->setTargetTriple(DefaultTriple); diff --git a/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp b/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp index 149db48c190a09..f0b4a2f55c1519 100644 --- a/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp +++ b/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp @@ -80,7 +80,7 @@ TEST_F(SPIRVAPITest, checkTranslateOk) { // Those command line arguments that overlap with registered by llc/codegen // are to be started with the ' ' symbol. 
std::vector SetOfOpts[] = { - {}, {"- mtriple=spirv32-unknown-unknown"}}; + {}, {"--spirv-mtriple=spirv32-unknown-unknown"}}; for (const auto &Opts : SetOfOpts) { for (StringRef &Assembly : Assemblies) { std::string Result, Error; @@ -100,7 +100,7 @@ TEST_F(SPIRVAPITest, checkTranslateError) { EXPECT_THAT(Error, StartsWith("SPIRVTranslateModule: Unknown command line argument " "'-mtriple=spirv32-unknown-unknown'")); - Status = toSpirv(OkAssembly, Result, Error, {}, {"- O 5"}); + Status = toSpirv(OkAssembly, Result, Error, {}, {"--spirv-O 5"}); EXPECT_FALSE(Status); EXPECT_TRUE(Result.empty()); EXPECT_EQ(Error, "Invalid optimization level!"); From 68c6b2e18809342e3747d50eb0dc84246393941b Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Thu, 23 Jan 2025 06:28:43 +0800 Subject: [PATCH 044/208] [ASTMatchers][NFC] use `Matcher` instead of `DynTypedMatcher` in `TypeLocTypeMatcher` (#123450) There are no template in `TypeLocTypeMatcher`. So we do not need to use `DynTypedMatcher` which can improve performance --- clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp | 1 + clang/include/clang/ASTMatchers/ASTMatchersInternal.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp index 1ff61bae46b1ed..4448e9ccba80d9 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp @@ -10,6 +10,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Lex/Lexer.h" using namespace clang::ast_matchers; diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h index 1f7b5e7cac8465..55a925bf869091 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h +++ 
b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h @@ -1804,7 +1804,7 @@ class LocMatcher : public MatcherInterface { /// /// Used to implement the \c loc() matcher. class TypeLocTypeMatcher : public MatcherInterface { - DynTypedMatcher InnerMatcher; + Matcher InnerMatcher; public: explicit TypeLocTypeMatcher(const Matcher &InnerMatcher) @@ -1814,8 +1814,7 @@ class TypeLocTypeMatcher : public MatcherInterface { BoundNodesTreeBuilder *Builder) const override { if (!Node) return false; - return this->InnerMatcher.matches(DynTypedNode::create(Node.getType()), - Finder, Builder); + return this->InnerMatcher.matches(Node.getType(), Finder, Builder); } }; From a2c683b665e99831c5d6343a9afeeae2877b393a Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 22 Jan 2025 23:30:23 +0100 Subject: [PATCH 045/208] [LLD][COFF] Use EC symbol table for exports defined in module definition files (#123849) --- lld/COFF/Driver.cpp | 64 +------------------------------- lld/COFF/Driver.h | 2 - lld/COFF/SymbolTable.cpp | 63 +++++++++++++++++++++++++++++++ lld/COFF/SymbolTable.h | 1 + lld/test/COFF/arm64x-export.test | 12 ++++++ 5 files changed, 77 insertions(+), 65 deletions(-) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 4e0678282eed01..3fde9c84d977db 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -30,7 +30,6 @@ #include "llvm/LTO/LTO.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/COFFImportFile.h" -#include "llvm/Object/COFFModuleDefinition.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" @@ -1012,67 +1011,6 @@ void LinkerDriver::createImportLibrary(bool asLib) { } } -void LinkerDriver::parseModuleDefs(StringRef path) { - llvm::TimeTraceScope timeScope("Parse def file"); - std::unique_ptr mb = - CHECK(MemoryBuffer::getFile(path, /*IsText=*/false, - /*RequiresNullTerminator=*/false, - /*IsVolatile=*/true), - "could not open " + path); - COFFModuleDefinition m = 
check(parseCOFFModuleDefinition( - mb->getMemBufferRef(), ctx.config.machine, ctx.config.mingw)); - - // Include in /reproduce: output if applicable. - ctx.driver.takeBuffer(std::move(mb)); - - if (ctx.config.outputFile.empty()) - ctx.config.outputFile = std::string(saver().save(m.OutputFile)); - ctx.config.importName = std::string(saver().save(m.ImportName)); - if (m.ImageBase) - ctx.config.imageBase = m.ImageBase; - if (m.StackReserve) - ctx.config.stackReserve = m.StackReserve; - if (m.StackCommit) - ctx.config.stackCommit = m.StackCommit; - if (m.HeapReserve) - ctx.config.heapReserve = m.HeapReserve; - if (m.HeapCommit) - ctx.config.heapCommit = m.HeapCommit; - if (m.MajorImageVersion) - ctx.config.majorImageVersion = m.MajorImageVersion; - if (m.MinorImageVersion) - ctx.config.minorImageVersion = m.MinorImageVersion; - if (m.MajorOSVersion) - ctx.config.majorOSVersion = m.MajorOSVersion; - if (m.MinorOSVersion) - ctx.config.minorOSVersion = m.MinorOSVersion; - - for (COFFShortExport e1 : m.Exports) { - Export e2; - // Renamed exports are parsed and set as "ExtName = Name". If Name has - // the form "OtherDll.Func", it shouldn't be a normal exported - // function but a forward to another DLL instead. This is supported - // by both MS and GNU linkers. 
- if (!e1.ExtName.empty() && e1.ExtName != e1.Name && - StringRef(e1.Name).contains('.')) { - e2.name = saver().save(e1.ExtName); - e2.forwardTo = saver().save(e1.Name); - } else { - e2.name = saver().save(e1.Name); - e2.extName = saver().save(e1.ExtName); - } - e2.exportAs = saver().save(e1.ExportAs); - e2.importName = saver().save(e1.ImportName); - e2.ordinal = e1.Ordinal; - e2.noname = e1.Noname; - e2.data = e1.Data; - e2.isPrivate = e1.Private; - e2.constant = e1.Constant; - e2.source = ExportSource::ModuleDefinition; - ctx.symtab.exports.push_back(e2); - } -} - void LinkerDriver::enqueueTask(std::function task) { taskQueue.push_back(std::move(task)); } @@ -2352,7 +2290,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Handle /def if (auto *arg = args.getLastArg(OPT_deffile)) { // parseModuleDefs mutates Config object. - parseModuleDefs(arg->getValue()); + mainSymtab.parseModuleDefs(arg->getValue()); } // Handle generation of import library from a def file. diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index 12724cbd1eef49..58dc5458e9a544 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -143,8 +143,6 @@ class LinkerDriver { // Used by the resolver to parse .drectve section contents. void parseDirectives(InputFile *file); - void parseModuleDefs(StringRef path); - // Parse an /order file. If an option is given, the linker places COMDAT // sections int he same order as their names appear in the given file. 
void parseOrderFile(StringRef arg); diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index ecccc7d6ed70c7..32ea4a5b2e1fc3 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Mangler.h" #include "llvm/LTO/LTO.h" +#include "llvm/Object/COFFModuleDefinition.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GlobPattern.h" #include "llvm/Support/Parallel.h" @@ -29,6 +30,7 @@ using namespace llvm; using namespace llvm::COFF; +using namespace llvm::object; using namespace llvm::support; namespace lld::coff { @@ -1253,6 +1255,67 @@ void SymbolTable::assignExportOrdinals() { << Twine(std::numeric_limits::max()) << ")"; } +void SymbolTable::parseModuleDefs(StringRef path) { + llvm::TimeTraceScope timeScope("Parse def file"); + std::unique_ptr mb = + CHECK(MemoryBuffer::getFile(path, /*IsText=*/false, + /*RequiresNullTerminator=*/false, + /*IsVolatile=*/true), + "could not open " + path); + COFFModuleDefinition m = check(parseCOFFModuleDefinition( + mb->getMemBufferRef(), machine, ctx.config.mingw)); + + // Include in /reproduce: output if applicable. 
+ ctx.driver.takeBuffer(std::move(mb)); + + if (ctx.config.outputFile.empty()) + ctx.config.outputFile = std::string(saver().save(m.OutputFile)); + ctx.config.importName = std::string(saver().save(m.ImportName)); + if (m.ImageBase) + ctx.config.imageBase = m.ImageBase; + if (m.StackReserve) + ctx.config.stackReserve = m.StackReserve; + if (m.StackCommit) + ctx.config.stackCommit = m.StackCommit; + if (m.HeapReserve) + ctx.config.heapReserve = m.HeapReserve; + if (m.HeapCommit) + ctx.config.heapCommit = m.HeapCommit; + if (m.MajorImageVersion) + ctx.config.majorImageVersion = m.MajorImageVersion; + if (m.MinorImageVersion) + ctx.config.minorImageVersion = m.MinorImageVersion; + if (m.MajorOSVersion) + ctx.config.majorOSVersion = m.MajorOSVersion; + if (m.MinorOSVersion) + ctx.config.minorOSVersion = m.MinorOSVersion; + + for (COFFShortExport e1 : m.Exports) { + Export e2; + // Renamed exports are parsed and set as "ExtName = Name". If Name has + // the form "OtherDll.Func", it shouldn't be a normal exported + // function but a forward to another DLL instead. This is supported + // by both MS and GNU linkers. 
+ if (!e1.ExtName.empty() && e1.ExtName != e1.Name && + StringRef(e1.Name).contains('.')) { + e2.name = saver().save(e1.ExtName); + e2.forwardTo = saver().save(e1.Name); + } else { + e2.name = saver().save(e1.Name); + e2.extName = saver().save(e1.ExtName); + } + e2.exportAs = saver().save(e1.ExportAs); + e2.importName = saver().save(e1.ImportName); + e2.ordinal = e1.Ordinal; + e2.noname = e1.Noname; + e2.data = e1.Data; + e2.isPrivate = e1.Private; + e2.constant = e1.Constant; + e2.source = ExportSource::ModuleDefinition; + exports.push_back(e2); + } +} + Symbol *SymbolTable::addUndefined(StringRef name) { return addUndefined(name, nullptr, false); } diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index e5b02ce5904c49..c8d7251838842f 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -160,6 +160,7 @@ class SymbolTable { void fixupExports(); void assignExportOrdinals(); + void parseModuleDefs(StringRef path); // Iterates symbols in non-determinstic hash table order. template void forEachSymbol(T callback) { diff --git a/lld/test/COFF/arm64x-export.test b/lld/test/COFF/arm64x-export.test index 526be633973581..a78b291cedbe10 100644 --- a/lld/test/COFF/arm64x-export.test +++ b/lld/test/COFF/arm64x-export.test @@ -55,6 +55,13 @@ RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj arm64ec-drectve.obj -n RUN: llvm-objdump -d out-drectve-ec.dll | FileCheck --check-prefix=DISASM-EC %s RUN: llvm-readobj --headers --coff-exports out-drectve-ec.dll | FileCheck --check-prefix=EXPORTS-EC %s +# A command-line def file applies only to EC exports. + +RUN: lld-link -machine:arm64x -dll -out:out-def-ec.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -def:func.def -noentry +RUN: llvm-objdump -d out-def-ec.dll | FileCheck --check-prefix=DISASM-EC %s +RUN: llvm-readobj --headers --coff-exports out-def-ec.dll | FileCheck --check-prefix=EXPORTS-EC %s + # Export using the EC .edata section. 
RUN: lld-link -machine:arm64x -dll -out:out-edata-ec.dll arm64ec-func.obj arm64-func.obj \ @@ -227,3 +234,8 @@ funcname_func: name: .asciz "out-edata.dll" + +#--- func.def +LIBRARY out.dll +EXPORTS + func From 4e9d5a3a307b362529583969e59070bc17909f23 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 22 Jan 2025 23:32:18 +0100 Subject: [PATCH 046/208] [LLD][COFF] Add support for the -defArm64Native argument (#123850) MSVC ignores the `/defArm64Native` argument on non-ARM64X targets. It is also ignored if the `/def` option is not specified. --- lld/COFF/Driver.cpp | 9 +++++++++ lld/COFF/Options.td | 3 +++ lld/test/COFF/arm64x-export.test | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 3fde9c84d977db..6eea11f5f451fd 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1795,6 +1795,9 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { setMachine(machine); } } + + // Most of main arguments apply either to both or only to EC symbol table on + // ARM64X target. SymbolTable &mainSymtab = ctx.hybridSymtab ? *ctx.hybridSymtab : ctx.symtab; // Handle /nodefaultlib: @@ -2291,6 +2294,12 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_deffile)) { // parseModuleDefs mutates Config object. mainSymtab.parseModuleDefs(arg->getValue()); + if (ctx.hybridSymtab) { + // MSVC ignores the /defArm64Native argument on non-ARM64X targets. + // It is also ignored if the /def option is not specified. + if (auto *arg = args.getLastArg(OPT_defarm64native)) + ctx.symtab.parseModuleDefs(arg->getValue()); + } } // Handle generation of import library from a def file. 
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index c7ceb51f70b70a..b6fd3d0daaef99 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -140,6 +140,9 @@ def incl : Joined<["/", "-", "/?", "-?"], "include:">, def deffile : Joined<["/", "-", "/?", "-?"], "def:">, HelpText<"Use module-definition file">; +def defarm64native + : P<"defarm64native", + "Use a module-definition file for the native view in a hybrid image.">; def debug : F<"debug">, HelpText<"Embed a symbol table in the image">; def debug_opt : P<"debug", "Embed a symbol table in the image with option">; def debugtype : P<"debugtype", "Debug Info Options">; diff --git a/lld/test/COFF/arm64x-export.test b/lld/test/COFF/arm64x-export.test index a78b291cedbe10..3ae0725a67089f 100644 --- a/lld/test/COFF/arm64x-export.test +++ b/lld/test/COFF/arm64x-export.test @@ -162,6 +162,29 @@ EXPORTS-BOTH-NEXT: RVA: 0x3000 EXPORTS-BOTH-NEXT: } EXPORTS-BOTH-NEXT: } +# Export using both the -def and -defarm64native arguments. + +RUN: lld-link -machine:arm64x -dll -out:out-def-both.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -def:func.def -defarm64native:func.def -noentry +RUN: llvm-objdump -d out-def-both.dll | FileCheck --check-prefix=DISASM-BOTH %s +RUN: llvm-readobj --headers --coff-exports out-def-both.dll | FileCheck --check-prefix=EXPORTS-BOTH %s + +# -defarm64native is ignored if -def is not specified. + +RUN: lld-link -machine:arm64x -dll -out:out-def-native.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -defarm64native:func.def -noentry +RUN: llvm-readobj --headers --coff-exports out-def-native.dll | FileCheck --check-prefix=NO-EXPORT %s +NO-EXPORT: ExportTableRVA: 0x0 +NO-EXPORT: ExportTableSize: 0x0 +NO-EXPORT: HybridObject { +NO-EXPORT: ExportTableRVA: 0x0 +NO-EXPORT: ExportTableSize: 0x0 +NO-EXPORT: } + +# -defarm64native is ignored on ARM64 target. 
+ +RUN: lld-link -machine:arm64 -dll -out:out-arm64-def.dll arm64-func.obj -defarm64native:invalid.def -def:func.def -noentry 2>&1 | count 0 + # Export using both the native and EC .edata sections. RUN: lld-link -machine:arm64x -dll -out:out-edata-both.dll arm64ec-func.obj arm64-func.obj \ From a77250fd782530f42a90f8562bcef0eb26abb010 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 22 Jan 2025 14:33:19 -0800 Subject: [PATCH 047/208] [mlir] Add C and Python interface for file range (#123276) Plumbs through creating file ranges to C and Python. --- mlir/include/mlir-c/IR.h | 5 +++++ mlir/lib/Bindings/Python/IRCore.cpp | 15 +++++++++++++++ mlir/lib/CAPI/IR/IR.cpp | 9 +++++++++ mlir/test/CAPI/ir.c | 7 +++++++ mlir/test/python/ir/location.py | 5 +++++ 5 files changed, 41 insertions(+) diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 0a515bbea3b504..7d2fd89e8560fc 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -256,6 +256,11 @@ mlirLocationFromAttribute(MlirAttribute attribute); MLIR_CAPI_EXPORTED MlirLocation mlirLocationFileLineColGet( MlirContext context, MlirStringRef filename, unsigned line, unsigned col); +/// Creates an File/Line/Column range location owned by the given context. +MLIR_CAPI_EXPORTED MlirLocation mlirLocationFileLineColRangeGet( + MlirContext context, MlirStringRef filename, unsigned start_line, + unsigned start_col, unsigned end_line, unsigned end_col); + /// Creates a call site location with a callee and a caller. 
MLIR_CAPI_EXPORTED MlirLocation mlirLocationCallSiteGet(MlirLocation callee, MlirLocation caller); diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index c862ec84fcbc55..738f1444b15fe5 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -50,6 +50,9 @@ static const char kContextGetCallSiteLocationDocstring[] = static const char kContextGetFileLocationDocstring[] = R"(Gets a Location representing a file, line and column)"; +static const char kContextGetFileRangeDocstring[] = + R"(Gets a Location representing a file, line and column range)"; + static const char kContextGetFusedLocationDocstring[] = R"(Gets a Location representing a fused location with optional metadata)"; @@ -2917,6 +2920,18 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("filename"), nb::arg("line"), nb::arg("col"), nb::arg("context").none() = nb::none(), kContextGetFileLocationDocstring) + .def_static( + "file", + [](std::string filename, int startLine, int startCol, int endLine, + int endCol, DefaultingPyMlirContext context) { + return PyLocation(context->getRef(), + mlirLocationFileLineColRangeGet( + context->get(), toMlirStringRef(filename), + startLine, startCol, endLine, endCol)); + }, + nb::arg("filename"), nb::arg("start_line"), nb::arg("start_col"), + nb::arg("end_line"), nb::arg("end_col"), + nb::arg("context").none() = nb::none(), kContextGetFileRangeDocstring) .def_static( "fused", [](const std::vector &pyLocations, diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index 24dc8854048532..f27af0ca9a2c78 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -264,6 +264,15 @@ MlirLocation mlirLocationFileLineColGet(MlirContext context, FileLineColLoc::get(unwrap(context), unwrap(filename), line, col))); } +MlirLocation +mlirLocationFileLineColRangeGet(MlirContext context, MlirStringRef filename, + unsigned startLine, unsigned startCol, + unsigned endLine, unsigned 
endCol) { + return wrap( + Location(FileLineColRange::get(unwrap(context), unwrap(filename), + startLine, startCol, endLine, endCol))); +} + MlirLocation mlirLocationCallSiteGet(MlirLocation callee, MlirLocation caller) { return wrap(Location(CallSiteLoc::get(unwrap(callee), unwrap(caller)))); } diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 15a3a1fb50dc9e..68da79f69cc0ad 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -2389,6 +2389,9 @@ void testDiagnostics(void) { MlirLocation fileLineColLoc = mlirLocationFileLineColGet( ctx, mlirStringRefCreateFromCString("file.c"), 1, 2); mlirEmitError(fileLineColLoc, "test diagnostics"); + MlirLocation fileLineColRange = mlirLocationFileLineColRangeGet( + ctx, mlirStringRefCreateFromCString("other-file.c"), 1, 2, 3, 4); + mlirEmitError(fileLineColRange, "test diagnostics"); MlirLocation callSiteLoc = mlirLocationCallSiteGet( mlirLocationFileLineColGet( ctx, mlirStringRefCreateFromCString("other-file.c"), 2, 3), @@ -2418,6 +2421,10 @@ void testDiagnostics(void) { // CHECK: >> end of diagnostic (userData: 42) // CHECK: processing diagnostic (userData: 42) << // CHECK: test diagnostics + // CHECK: loc("other-file.c":1:2 to 3:4) + // CHECK: >> end of diagnostic (userData: 42) + // CHECK: processing diagnostic (userData: 42) << + // CHECK: test diagnostics // CHECK: loc(callsite("other-file.c":2:3 at "file.c":1:2)) // CHECK: >> end of diagnostic (userData: 42) // CHECK: processing diagnostic (userData: 42) << diff --git a/mlir/test/python/ir/location.py b/mlir/test/python/ir/location.py index f66d6c501dcf5c..59d8a89e770dd9 100644 --- a/mlir/test/python/ir/location.py +++ b/mlir/test/python/ir/location.py @@ -47,12 +47,17 @@ def testLocationAttr(): def testFileLineCol(): with Context() as ctx: loc = Location.file("foo.txt", 123, 56) + range = Location.file("foo.txt", 123, 56, 123, 100) ctx = None gc.collect() # CHECK: file str: loc("foo.txt":123:56) print("file str:", str(loc)) # CHECK: file repr: 
loc("foo.txt":123:56) print("file repr:", repr(loc)) + # CHECK: file range str: loc("foo.txt":123:56 to :100) + print("file range str:", str(range)) + # CHECK: file range repr: loc("foo.txt":123:56 to :100) + print("file range repr:", repr(range)) run(testFileLineCol) From 223bd0ca81f871beb31d40b4f02753493c30b5b2 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 22 Jan 2025 23:33:44 +0100 Subject: [PATCH 048/208] [libc++] Avoid unnecessary instantiations for __copy_cvref_t (#123718) This changes the implementation of `__copy_cvref_t` to only template the implementation class on the `_From` parameter, avoiding instantiations for every combination of `_From` and `_To`. --- libcxx/include/__type_traits/copy_cvref.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/libcxx/include/__type_traits/copy_cvref.h b/libcxx/include/__type_traits/copy_cvref.h index 511d4e0776d609..158e5a5d78bb35 100644 --- a/libcxx/include/__type_traits/copy_cvref.h +++ b/libcxx/include/__type_traits/copy_cvref.h @@ -20,23 +20,26 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template +template struct __copy_cvref { - using type = __copy_cv_t<_From, _To>; + template + using __apply _LIBCPP_NODEBUG = __copy_cv_t<_From, _To>; }; -template -struct __copy_cvref<_From&, _To> { - using type = __add_lvalue_reference_t<__copy_cv_t<_From, _To> >; +template +struct __copy_cvref<_From&> { + template + using __apply _LIBCPP_NODEBUG = __add_lvalue_reference_t<__copy_cv_t<_From, _To> >; }; -template -struct __copy_cvref<_From&&, _To> { - using type = __add_rvalue_reference_t<__copy_cv_t<_From, _To> >; +template +struct __copy_cvref<_From&&> { + template + using __apply _LIBCPP_NODEBUG = __add_rvalue_reference_t<__copy_cv_t<_From, _To> >; }; template -using __copy_cvref_t _LIBCPP_NODEBUG = typename __copy_cvref<_From, _To>::type; +using __copy_cvref_t _LIBCPP_NODEBUG = typename __copy_cvref<_From>::template __apply<_To>; _LIBCPP_END_NAMESPACE_STD From 
27ccc99c4f4300115aa0d619e8e4693f18b2af9d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 22 Jan 2025 14:42:52 -0800 Subject: [PATCH 049/208] [RISCV][VLOpt] Minor worklist invariant cleanup [NFC] (#123989) In retrospect, this probably should have been rolled into #123973. It seemed more involved when I first decided to split. :) --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index fc3300247b1909..63f4ab4d572d59 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1189,6 +1189,10 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return false; } + assert(MI.getOperand(0).isReg() && + isVectorRegClass(MI.getOperand(0).getReg(), MRI) && + "All supported instructions produce a vector register result"); + LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n"); return true; } @@ -1295,9 +1299,6 @@ std::optional RISCVVLOptimizer::checkUsers(MachineInstr &MI) { bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) { LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); - if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI)) - return false; - auto CommonVL = checkUsers(MI); if (!CommonVL) return false; @@ -1353,14 +1354,11 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { auto PushOperands = [this, &Worklist](MachineInstr &MI, bool IgnoreSameBlock) { for (auto &Op : MI.operands()) { - if (!Op.isReg() || !Op.isUse() || !Op.getReg().isVirtual()) - continue; - - if (!isVectorRegClass(Op.getReg(), MRI)) + if (!Op.isReg() || !Op.isUse() || !Op.getReg().isVirtual() || + !isVectorRegClass(Op.getReg(), MRI)) continue; MachineInstr *DefMI = MRI->getVRegDef(Op.getReg()); - if (!isCandidate(*DefMI)) continue; @@ -1394,6 +1392,7 @@ bool 
RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { while (!Worklist.empty()) { assert(MadeChange); MachineInstr &MI = *Worklist.pop_back_val(); + assert(isCandidate(MI)); if (!tryReduceVL(MI)) continue; PushOperands(MI, /*IgnoreSameBlock*/ false); From e0ae8890460d6c6a90dd27fe6762128b2161000b Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 22 Jan 2025 14:43:22 -0800 Subject: [PATCH 050/208] [SandboxIR][Doc] Add Quick start notes (#123992) --- llvm/docs/SandboxIR.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md index 3b792659bb59ba..61bae4e36ef435 100644 --- a/llvm/docs/SandboxIR.md +++ b/llvm/docs/SandboxIR.md @@ -2,6 +2,41 @@ Sandbox IR is an IR layer on top of LLVM IR that allows you to save/restore its state. +# Quick Start Notes + +Within your LLVM pass: + +``` +// 1. Include the necessary Sandbox IR header files. +#include "llvm/SandboxIR/Context.h +#include "llvm/SandboxIR/Function.h + +// 2. Create a sandboxir::Context using LLVMContext `LLVMCtx`. +sandboxir::Context Ctx(LLVMCtx); + +// 3. Create a sandboxir::Function using LLVM IR Function `LLVMF`. +auto *F = Ctx.createFunction(LLVMF); + +// ... Use Sandbox IR in `F` as usual, e.g., iterating, modifying it etc. ... + +// 4. Save state when needed. +Ctx.save(); + +// ... Modify Sandbox IR ... + +// 5. Revert to the saved state. +Ctx.revert(); +``` + +Make sure you link against `SandboxIR` in `CMakeLists.txt`: + +``` +LINK_COMPONENTS +... +SandboxIR +... +``` + # API The Sandbox IR API is designed to feel like LLVM, replicating many common API classes and functions to mirror the LLVM API. The class hierarchy is similar (but in the `llvm::sandboxir` namespace). From 630177ccdde44b0dd8faa13b34002d15c4b0af8d Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Wed, 22 Jan 2025 23:03:48 +0000 Subject: [PATCH 051/208] [compiler-rt][rtsan] Fix madvise/posix_madvise for macOs. 
(#124020) only bsd and linux intercept these syscalls. Fix #123601 --- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp | 11 +++++++++-- .../lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 71938d3edba38d..a9812f90dec079 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -808,6 +808,7 @@ INTERCEPTOR(int, munmap, void *addr, size_t length) { return REAL(munmap)(addr, length); } +#if !SANITIZER_APPLE INTERCEPTOR(int, madvise, void *addr, size_t length, int flag) { __rtsan_notify_intercepted_call("madvise"); return REAL(madvise)(addr, length, flag); @@ -817,6 +818,12 @@ INTERCEPTOR(int, posix_madvise, void *addr, size_t length, int flag) { __rtsan_notify_intercepted_call("posix_madvise"); return REAL(posix_madvise)(addr, length, flag); } +#define RTSAN_MAYBE_INTERCEPT_MADVISE INTERCEPT_FUNCTION(madvise) +#define RTSAN_MAYBE_INTERCEPT_POSIX_MADVISE INTERCEPT_FUNCTION(posix_madvise) +#else +#define RTSAN_MAYBE_INTERCEPT_MADVISE +#define RTSAN_MAYBE_INTERCEPT_POSIX_MADVISE +#endif INTERCEPTOR(int, mprotect, void *addr, size_t length, int prot) { __rtsan_notify_intercepted_call("mprotect"); @@ -1216,8 +1223,8 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(mmap); RTSAN_MAYBE_INTERCEPT_MMAP64; INTERCEPT_FUNCTION(munmap); - INTERCEPT_FUNCTION(madvise); - INTERCEPT_FUNCTION(posix_madvise); + RTSAN_MAYBE_INTERCEPT_MADVISE; + RTSAN_MAYBE_INTERCEPT_POSIX_MADVISE; INTERCEPT_FUNCTION(mprotect); INTERCEPT_FUNCTION(msync); INTERCEPT_FUNCTION(mincore); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 0a59ae0ea92548..23b1728c7a4594 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ 
b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -238,6 +238,7 @@ class RtsanOpenedMmapTest : public RtsanFileTest { int fd = -1; }; +#if !SANITIZER_APPLE TEST_F(RtsanOpenedMmapTest, MadviseDiesWhenRealtime) { auto Func = [this]() { madvise(GetAddr(), GetSize(), MADV_NORMAL); }; ExpectRealtimeDeath(Func, "madvise"); @@ -245,10 +246,13 @@ TEST_F(RtsanOpenedMmapTest, MadviseDiesWhenRealtime) { } TEST_F(RtsanOpenedMmapTest, PosixMadviseDiesWhenRealtime) { - auto Func = [this]() { posix_madvise(GetAddr(), GetSize(), MADV_NORMAL); }; + auto Func = [this]() { + posix_madvise(GetAddr(), GetSize(), POSIX_MADV_NORMAL); + }; ExpectRealtimeDeath(Func, "posix_madvise"); ExpectNonRealtimeSurvival(Func); } +#endif TEST_F(RtsanOpenedMmapTest, MprotectDiesWhenRealtime) { auto Func = [this]() { mprotect(GetAddr(), GetSize(), PROT_READ); }; From 939f2900d03c6ab0a89ba619ff25c8542bd11a5a Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Wed, 22 Jan 2025 23:23:32 +0000 Subject: [PATCH 052/208] [compiler-rt][rtsan] getsockopt/setsockopt interception. 
(#124004) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 21 ++++++++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 22 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index a9812f90dec079..a01354781272d5 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -1002,6 +1002,25 @@ INTERCEPTOR(int, accept4, int socket, struct sockaddr *address, #define RTSAN_MAYBE_INTERCEPT_ACCEPT4 #endif +#if SANITIZER_INTERCEPT_GETSOCKOPT +INTERCEPTOR(int, getsockopt, int socket, int level, int option, void *value, + socklen_t *len) { + __rtsan_notify_intercepted_call("getsockopt"); + return REAL(getsockopt)(socket, level, option, value, len); +} + +INTERCEPTOR(int, setsockopt, int socket, int level, int option, + const void *value, socklen_t len) { + __rtsan_notify_intercepted_call("setsockopt"); + return REAL(setsockopt)(socket, level, option, value, len); +} +#define RTSAN_MAYBE_INTERCEPT_GETSOCKOPT INTERCEPT_FUNCTION(getsockopt) +#define RTSAN_MAYBE_INTERCEPT_SETSOCKOPT INTERCEPT_FUNCTION(setsockopt) +#else +#define RTSAN_MAYBE_INTERCEPT_GETSOCKOPT +#define RTSAN_MAYBE_INTERCEPT_SETSOCKOPT +#endif + // I/O Multiplexing INTERCEPTOR(int, poll, struct pollfd *fds, nfds_t nfds, int timeout) { @@ -1332,6 +1351,8 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_ACCEPT4; RTSAN_MAYBE_INTERCEPT_GETSOCKNAME; RTSAN_MAYBE_INTERCEPT_GETPEERNAME; + RTSAN_MAYBE_INTERCEPT_GETSOCKOPT; + RTSAN_MAYBE_INTERCEPT_SETSOCKOPT; RTSAN_MAYBE_INTERCEPT_SELECT; INTERCEPT_FUNCTION(pselect); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 23b1728c7a4594..981766c85f965e 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ 
b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -1286,6 +1286,28 @@ TEST(TestRtsanInterceptors, GetpeernameOnASocketDiesWhenRealtime) { } #endif +#if SANITIZER_INTERCEPT_GETSOCKOPT +TEST(TestRtsanInterceptors, GetsockoptOnASocketDiesWhenRealtime) { + int val = 0; + socklen_t len = static_cast(sizeof(val)); + auto Func = [&val, &len]() { + getsockopt(0, SOL_SOCKET, SO_REUSEADDR, &val, &len); + }; + ExpectRealtimeDeath(Func, "getsockopt"); + ExpectNonRealtimeSurvival(Func); +} + +TEST(TestRtsanInterceptors, SetsockoptOnASocketDiesWhenRealtime) { + int val = 0; + socklen_t len = static_cast(sizeof(val)); + auto Func = [&val, &len]() { + setsockopt(0, SOL_SOCKET, SO_REUSEADDR, &val, len); + }; + ExpectRealtimeDeath(Func, "setsockopt"); + ExpectNonRealtimeSurvival(Func); +} +#endif + /* I/O Multiplexing */ From fd087135efe1b62b506c3caef3fef83242a8e504 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 22 Jan 2025 15:23:47 -0800 Subject: [PATCH 053/208] [SandboxVec][Legality] Diamond reuse multi input (#123426) This patch implements the diamond pattern where we are vectorizing toward the top of the diamond from both edges, but the second edge may use elements from a different vector or just scalar values. This requires some additional packing code (see lit test). 
--- .../Vectorize/SandboxVectorizer/Legality.h | 22 ++++++++++-- .../Vectorize/SandboxVectorizer/Legality.cpp | 3 +- .../SandboxVectorizer/Passes/BottomUpVec.cpp | 34 +++++++++++++++++++ .../SandboxVectorizer/bottomup_basic.ll | 27 +++++++++++++++ 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index 4858ebaf0770aa..f10c535aa820ee 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -81,6 +81,7 @@ enum class LegalityResultID { Widen, ///> Vectorize by combining scalars to a vector. DiamondReuse, ///> Don't generate new code, reuse existing vector. DiamondReuseWithShuffle, ///> Reuse the existing vector but add a shuffle. + DiamondReuseMultiInput, ///> Reuse more than one vector and/or scalars. }; /// The reason for vectorizing or not vectorizing. @@ -108,6 +109,8 @@ struct ToStr { return "DiamondReuse"; case LegalityResultID::DiamondReuseWithShuffle: return "DiamondReuseWithShuffle"; + case LegalityResultID::DiamondReuseMultiInput: + return "DiamondReuseMultiInput"; } llvm_unreachable("Unknown LegalityResultID enum"); } @@ -287,6 +290,20 @@ class CollectDescr { } }; +class DiamondReuseMultiInput final : public LegalityResult { + friend class LegalityAnalysis; + CollectDescr Descr; + DiamondReuseMultiInput(CollectDescr &&Descr) + : LegalityResult(LegalityResultID::DiamondReuseMultiInput), + Descr(std::move(Descr)) {} + +public: + static bool classof(const LegalityResult *From) { + return From->getSubclassID() == LegalityResultID::DiamondReuseMultiInput; + } + const CollectDescr &getCollectDescr() const { return Descr; } +}; + /// Performs the legality analysis and returns a LegalityResult object. 
class LegalityAnalysis { Scheduler Sched; @@ -312,8 +329,9 @@ class LegalityAnalysis { : Sched(AA, Ctx), SE(SE), DL(DL), IMaps(IMaps) {} /// A LegalityResult factory. template - ResultT &createLegalityResult(ArgsT... Args) { - ResultPool.push_back(std::unique_ptr(new ResultT(Args...))); + ResultT &createLegalityResult(ArgsT &&...Args) { + ResultPool.push_back( + std::unique_ptr(new ResultT(std::move(Args)...))); return cast(*ResultPool.back()); } /// Checks if it's legal to vectorize the instructions in \p Bndl. diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index ad3e38e2f1d923..085f4cd67ab76e 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -223,7 +223,8 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, return createLegalityResult(Vec); return createLegalityResult(Vec, Mask); } - llvm_unreachable("TODO: Unimplemented"); + return createLegalityResult( + std::move(CollectDescrs)); } if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl)) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index d62023ea018846..c6ab3c1942c330 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -308,6 +308,40 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { NewVec = createShuffle(VecOp, Mask); break; } + case LegalityResultID::DiamondReuseMultiInput: { + const auto &Descr = + cast(LegalityRes).getCollectDescr(); + Type *ResTy = FixedVectorType::get(Bndl[0]->getType(), Bndl.size()); + + // TODO: Try to get WhereIt without creating a vector. 
+ SmallVector DescrInstrs; + for (const auto &ElmDescr : Descr.getDescrs()) { + if (auto *I = dyn_cast(ElmDescr.getValue())) + DescrInstrs.push_back(I); + } + auto WhereIt = getInsertPointAfterInstrs(DescrInstrs); + + Value *LastV = PoisonValue::get(ResTy); + for (auto [Lane, ElmDescr] : enumerate(Descr.getDescrs())) { + Value *VecOp = ElmDescr.getValue(); + Context &Ctx = VecOp->getContext(); + Value *ValueToInsert; + if (ElmDescr.needsExtract()) { + ConstantInt *IdxC = + ConstantInt::get(Type::getInt32Ty(Ctx), ElmDescr.getExtractIdx()); + ValueToInsert = ExtractElementInst::create(VecOp, IdxC, WhereIt, + VecOp->getContext(), "VExt"); + } else { + ValueToInsert = VecOp; + } + ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane); + Value *Ins = InsertElementInst::create(LastV, ValueToInsert, LaneC, + WhereIt, Ctx, "VIns"); + LastV = Ins; + } + NewVec = LastV; + break; + } case LegalityResultID::Pack: { // If we can't vectorize the seeds then just return. if (Depth == 0) diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index a3798af8399087..5b389e25d70d95 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -242,3 +242,30 @@ define void @diamondWithShuffle(ptr %ptr) { store float %sub1, ptr %ptr1 ret void } + +define void @diamondMultiInput(ptr %ptr, ptr %ptrX) { +; CHECK-LABEL: define void @diamondMultiInput( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTRX:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[LDX:%.*]] = load float, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[VINS:%.*]] = insertelement <2 x float> poison, float [[LDX]], i32 0 +; CHECK-NEXT: [[VEXT:%.*]] = extractelement <2 x float> [[VECL]], i32 0 +; CHECK-NEXT: [[VINS1:%.*]] = insertelement <2 x float> [[VINS]], float 
[[VEXT]], i32 1 +; CHECK-NEXT: [[VEC:%.*]] = fsub <2 x float> [[VECL]], [[VINS1]] +; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr float, ptr %ptr, i32 0 + %ptr1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %ptr0 + %ld1 = load float, ptr %ptr1 + + %ldX = load float, ptr %ptrX + + %sub0 = fsub float %ld0, %ldX + %sub1 = fsub float %ld1, %ld0 + store float %sub0, ptr %ptr0 + store float %sub1, ptr %ptr1 + ret void +} From 64360899c76cb2e687ef1fcea617ef455e8a2621 Mon Sep 17 00:00:00 2001 From: Yeoul Na Date: Wed, 22 Jan 2025 15:41:59 -0800 Subject: [PATCH 054/208] [BoundsSafety][Doc] Add BoundsSafetyAdoptionGuide.rst (#120674) This adds an instruction to adopt `-fbounds-safety` using the preview implementation available in the fork of llvm-project. --- clang/docs/BoundsSafety.rst | 9 ++- clang/docs/BoundsSafetyAdoptionGuide.rst | 90 ++++++++++++++++++++++++ clang/docs/index.rst | 1 + 3 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 clang/docs/BoundsSafetyAdoptionGuide.rst diff --git a/clang/docs/BoundsSafety.rst b/clang/docs/BoundsSafety.rst index 8fd655663edb00..e24c69d8c7855f 100644 --- a/clang/docs/BoundsSafety.rst +++ b/clang/docs/BoundsSafety.rst @@ -996,4 +996,11 @@ and the soundness of the type system. This may incur significant code size overhead in unoptimized builds and leaving some of the adoption mistakes to be caught only at run time. This is not a fundamental limitation, however, because incrementally adding necessary static analysis will allow us to catch issues -early on and remove unnecessary bounds checks in unoptimized builds. \ No newline at end of file +early on and remove unnecessary bounds checks in unoptimized builds. + +Try it out +========== + +Your feedback on the programming model is valuable. 
You may want to follow the +instructions in :doc:`BoundsSafetyAdoptionGuide` to play with ``-fbounds-safety`` +and please send your feedback to `Yeoul Na `_. \ No newline at end of file diff --git a/clang/docs/BoundsSafetyAdoptionGuide.rst b/clang/docs/BoundsSafetyAdoptionGuide.rst new file mode 100644 index 00000000000000..9998ce58e67c1a --- /dev/null +++ b/clang/docs/BoundsSafetyAdoptionGuide.rst @@ -0,0 +1,90 @@ +====================================== +Adoption Guide for ``-fbounds-safety`` +====================================== + +.. contents:: + :local: + +Where to get ``-fbounds-safety`` +================================ + +The open sourcing to llvm.org's ``llvm-project`` is still ongoing and the +feature is not available yet. In the meantime, the preview implementation is +available +`here `_ in a +fork of ``llvm-project``. Please follow +`Building LLVM with CMake `_ to build the +compiler. + +Feature flag +============ + +Pass ``-fbounds-safety`` as a Clang compilation flag for the C file that you +want to adopt. We recommend adopting the model file by file, because adoption +requires some effort to add bounds annotations and fix compiler diagnostics. + +Include ``ptrcheck.h`` +====================== + +``ptrcheck.h`` is a Clang toolchain header that provides definitions of the bounds +annotations such as ``__counted_by``, ``__counted_by_or_null``, ``__sized_by``, +etc. In the LLVM source tree, the header is located in +``llvm-project/clang/lib/Headers/ptrcheck.h``. + + +Add bounds annotations on pointers as necessary +=============================================== + +Annotate pointers on struct fields and function parameters if they are pointing +to an array of objects, with appropriate bounds annotations. Please see +:doc:`BoundsSafety` to learn what kinds of bounds annotations are available and +their semantics.
Note that local pointer variables typically don't need bounds +annotations because they are implicitly a wide pointer (``__bidi_indexable``) +that automatically carries the bounds information. + +Address compiler diagnostics +============================ + +Once you pass ``-fbounds-safety`` to the compiler for a C file, you will see some new +compiler warnings and errors, which guide adoption of ``-fbounds-safety``. +Consider the following example: + +.. code-block:: c + + #include + + void init_buf(int *p, int n) { + for (int i = 0; i < n; ++i) + p[i] = 0; // error: array subscript on single pointer 'p' must use a constant index of 0 to be in bounds + } + +The parameter ``int *p`` doesn't have a bounds annotation, so the compiler will +complain about the code indexing into it (``p[i]``) as it assumes that ``p`` is +pointing to a single ``int`` object or null. To address the diagnostics, you +should add a bounds annotation on ``int *p`` so that the compiler can reason +about the safety of the array subscript. In the following example, ``p`` is now +``int *__counted_by(n)``, so the compiler will allow the array subscript with +additional run-time checks as necessary. + +.. code-block:: c + + #include + + void init_buf(int *__counted_by(n) p, int n) { + for (int i = 0; i < n; ++i) + p[i] = 0; // ok; `p` now has a type with a bounds annotation. + } + +Run test suites to fix new run-time traps +========================================= + +Adopting ``-fbounds-safety`` may cause your program to trap if it violates +bounds safety or its adoption is incorrect. Thus, it is necessary to perform +run-time testing of your program to gain confidence that it won't trap at +run time. + +Repeat the process for each remaining file +========================================== + +Once you are done with adopting a single C file, please repeat the same process +for each remaining C file that you want to adopt.
\ No newline at end of file diff --git a/clang/docs/index.rst b/clang/docs/index.rst index cc070059eede5d..349378b1efa214 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -40,6 +40,7 @@ Using Clang as a Compiler SanitizerStats SanitizerSpecialCaseList BoundsSafety + BoundsSafetyAdoptionGuide BoundsSafetyImplPlans ControlFlowIntegrity LTOVisibility From a939a9fd53d98f33b94f9121646d5906a2b9f598 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 22 Jan 2025 15:49:13 -0800 Subject: [PATCH 055/208] [LLDB-DAP] Send Progress update message over DAP (#123837) When testing my SBProgress DAP PR (#123826), I noticed Progress update messages aren't sent over DAP. This patch adds the lldb progress event's message to the body when sent over DAP. Before ![image](https://github.com/user-attachments/assets/404adaa8-b784-4f23-895f-cd3625fdafad) Now ![image](https://github.com/user-attachments/assets/eb1c3235-0936-4e36-96e5-0a0ee60dabb8) Tested with my [progress tester command](https://gist.github.com/Jlalond/48d85e75a91f7a137e3142e6a13d0947), testing 10 events 5 seconds apart 1-10 --- .../test/API/tools/lldb-dap/progress/Makefile | 3 + .../lldb-dap/progress/Progress_emitter.py | 84 +++++++++++++++++++ .../lldb-dap/progress/TestDAP_Progress.py | 49 +++++++++++ .../test/API/tools/lldb-dap/progress/main.cpp | 5 ++ lldb/tools/lldb-dap/ProgressEvent.cpp | 11 ++- lldb/tools/lldb-dap/ProgressEvent.h | 3 +- 6 files changed, 150 insertions(+), 5 deletions(-) create mode 100644 lldb/test/API/tools/lldb-dap/progress/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py create mode 100755 lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py create mode 100644 lldb/test/API/tools/lldb-dap/progress/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/progress/Makefile b/lldb/test/API/tools/lldb-dap/progress/Makefile new file mode 100644 index 00000000000000..99998b20bcb050 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/progress/Makefile @@ 
-0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py new file mode 100644 index 00000000000000..7f4055cab9ddda --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py @@ -0,0 +1,84 @@ +import inspect +import optparse +import shlex +import sys +import time + +import lldb + + +class ProgressTesterCommand: + program = "test-progress" + + @classmethod + def register_lldb_command(cls, debugger, module_name): + parser = cls.create_options() + cls.__doc__ = parser.format_help() + # Add any commands contained in this module to LLDB + command = "command script add -c %s.%s %s" % ( + module_name, + cls.__name__, + cls.program, + ) + debugger.HandleCommand(command) + print( + 'The "{0}" command has been installed, type "help {0}" or "{0} ' + '--help" for detailed help.'.format(cls.program) + ) + + @classmethod + def create_options(cls): + usage = "usage: %prog [options]" + description = "SBProgress testing tool" + # Opt parse is deprecated, but leaving this the way it is because it allows help formating + # Additionally all our commands use optparse right now, ideally we migrate them all in one go. 
+ parser = optparse.OptionParser( + description=description, prog=cls.program, usage=usage + ) + + parser.add_option( + "--total", dest="total", help="Total to count up.", type="int" + ) + + parser.add_option( + "--seconds", + dest="seconds", + help="Total number of seconds to wait between increments", + type="int", + ) + + return parser + + def get_short_help(self): + return "Progress Tester" + + def get_long_help(self): + return self.help_string + + def __init__(self, debugger, unused): + self.parser = self.create_options() + self.help_string = self.parser.format_help() + + def __call__(self, debugger, command, exe_ctx, result): + command_args = shlex.split(command) + try: + (cmd_options, args) = self.parser.parse_args(command_args) + except: + result.SetError("option parsing failed") + return + + total = cmd_options.total + progress = lldb.SBProgress("Progress tester", "Detail", total, debugger) + + for i in range(1, total): + progress.Increment(1, f"Step {i}") + time.sleep(cmd_options.seconds) + + +def __lldb_init_module(debugger, dict): + # Register all classes that have a register_lldb_command method + for _name, cls in inspect.getmembers(sys.modules[__name__]): + if inspect.isclass(cls) and callable( + getattr(cls, "register_lldb_command", None) + ): + cls.register_lldb_command(debugger, __name__) diff --git a/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py new file mode 100755 index 00000000000000..36c0cef9c47143 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py @@ -0,0 +1,49 @@ +""" +Test lldb-dap output events +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import os +import time + +import lldbdap_testcase + + +class TestDAP_progress(lldbdap_testcase.DAPTestCaseBase): + @skipIfWindows + def test_output(self): + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + progress_emitter = 
os.path.join(os.getcwd(), "Progress_emitter.py") + print(f"Progress emitter path: {progress_emitter}") + source = "main.cpp" + # Set breakpoint in the thread function so we can step the threads + breakpoint_ids = self.set_source_breakpoints( + source, [line_number(source, "// break here")] + ) + self.continue_to_breakpoints(breakpoint_ids) + self.dap_server.request_evaluate( + f"`command script import {progress_emitter}", context="repl" + ) + self.dap_server.request_evaluate( + "`test-progress --total 3 --seconds 1", context="repl" + ) + + self.dap_server.wait_for_event("progressEnd", 15) + # Expect at least a start, an update, and end event + # However because the underlying Progress instance is an RAII object and we can't guarantee + # its deterministic destruction in the python API, we verify just start and update + # otherwise this test could be flaky. + self.assertTrue(len(self.dap_server.progress_events) > 0) + start_found = False + update_found = False + for event in self.dap_server.progress_events: + event_type = event["event"] + if "progressStart" in event_type: + start_found = True + if "progressUpdate" in event_type: + update_found = True + + self.assertTrue(start_found) + self.assertTrue(update_found) diff --git a/lldb/test/API/tools/lldb-dap/progress/main.cpp b/lldb/test/API/tools/lldb-dap/progress/main.cpp new file mode 100644 index 00000000000000..3bac5d0fd6db1a --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/progress/main.cpp @@ -0,0 +1,5 @@ +int main() { + char *ptr = "unused"; + // break here + return 0; +} diff --git a/lldb/tools/lldb-dap/ProgressEvent.cpp b/lldb/tools/lldb-dap/ProgressEvent.cpp index 0dcc2ee81001d5..6a4978c055e516 100644 --- a/lldb/tools/lldb-dap/ProgressEvent.cpp +++ b/lldb/tools/lldb-dap/ProgressEvent.cpp @@ -117,6 +117,9 @@ json::Value ProgressEvent::ToJSON() const { body.try_emplace("cancellable", false); } + if (m_event_type == progressUpdate) + EmplaceSafeString(body, "message", m_message); + + std::string
timestamp(llvm::formatv("{0:f9}", m_creation_time.count())); EmplaceSafeString(body, "timestamp", timestamp); @@ -164,10 +167,10 @@ const ProgressEvent &ProgressEventManager::GetMostRecentEvent() const { return m_last_update_event ? *m_last_update_event : m_start_event; } -void ProgressEventManager::Update(uint64_t progress_id, uint64_t completed, - uint64_t total) { +void ProgressEventManager::Update(uint64_t progress_id, llvm::StringRef message, + uint64_t completed, uint64_t total) { if (std::optional event = ProgressEvent::Create( - progress_id, std::nullopt, completed, total, &GetMostRecentEvent())) { + progress_id, message, completed, total, &GetMostRecentEvent())) { if (event->GetEventType() == progressEnd) m_finished = true; @@ -227,7 +230,7 @@ void ProgressEventReporter::Push(uint64_t progress_id, const char *message, m_unreported_start_events.push(event_manager); } } else { - it->second->Update(progress_id, completed, total); + it->second->Update(progress_id, StringRef(message), completed, total); if (it->second->Finished()) m_event_managers.erase(it); } diff --git a/lldb/tools/lldb-dap/ProgressEvent.h b/lldb/tools/lldb-dap/ProgressEvent.h index 72317b879c803a..d1b9b9dd887cd8 100644 --- a/lldb/tools/lldb-dap/ProgressEvent.h +++ b/lldb/tools/lldb-dap/ProgressEvent.h @@ -99,7 +99,8 @@ class ProgressEventManager { /// Receive a new progress event for the start event and try to report it if /// appropriate. - void Update(uint64_t progress_id, uint64_t completed, uint64_t total); + void Update(uint64_t progress_id, llvm::StringRef message, uint64_t completed, + uint64_t total); /// \return /// \b true if a \a progressEnd event has been notified. 
There's no From 6e498bc2cd765f4c421d32d610bdc0effec62b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 22 Jan 2025 15:59:32 -0800 Subject: [PATCH 056/208] [flang][cuda] Handle simple device pointer allocation (#123996) --- flang/include/flang/Runtime/CUDA/pointer.h | 27 +++++++++++++ .../Optimizer/Transforms/CUFOpConversion.cpp | 27 +++++++++++-- flang/runtime/CUDA/CMakeLists.txt | 1 + flang/runtime/CUDA/pointer.cpp | 40 +++++++++++++++++++ flang/test/Fir/CUDA/cuda-allocate.fir | 11 +++++ 5 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 flang/include/flang/Runtime/CUDA/pointer.h create mode 100644 flang/runtime/CUDA/pointer.cpp diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h new file mode 100644 index 00000000000000..db5242696303f5 --- /dev/null +++ b/flang/include/flang/Runtime/CUDA/pointer.h @@ -0,0 +1,27 @@ +//===-- include/flang/Runtime/CUDA/pointer.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_CUDA_POINTER_H_ +#define FORTRAN_RUNTIME_CUDA_POINTER_H_ + +#include "flang/Runtime/descriptor-consts.h" +#include "flang/Runtime/entry-names.h" + +namespace Fortran::runtime::cuda { + +extern "C" { + +/// Perform allocation of the descriptor. 
+int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); + +} // extern "C" + +} // namespace Fortran::runtime::cuda +#endif // FORTRAN_RUNTIME_CUDA_POINTER_H_ diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8b8c00fa7ecfcb..23248f6d12622a 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -20,6 +20,7 @@ #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" #include "flang/Runtime/CUDA/memory.h" +#include "flang/Runtime/CUDA/pointer.h" #include "flang/Runtime/allocatable.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" @@ -161,7 +162,18 @@ struct CUFAllocateOpConversion fir::FirOpBuilder builder(rewriter, mod); mlir::Location loc = op.getLoc(); + bool isPointer = false; + + if (auto declareOp = + mlir::dyn_cast_or_null(op.getBox().getDefiningOp())) + if (declareOp.getFortranAttrs() && + bitEnumContainsAny(*declareOp.getFortranAttrs(), + fir::FortranVariableFlagsEnum::pointer)) + isPointer = true; + if (hasDoubleDescriptors(op)) { + if (isPointer) + TODO(loc, "pointer allocation with double descriptors"); // Allocation for module variable are done with custom runtime entry point // so the descriptors can be synchronized. mlir::func::FuncOp func; @@ -176,13 +188,20 @@ struct CUFAllocateOpConversion } mlir::func::FuncOp func; - if (op.getSource()) + if (op.getSource()) { + if (isPointer) + TODO(loc, "pointer allocation with source"); func = fir::runtime::getRuntimeFunc( loc, builder); - else - func = fir::runtime::getRuntimeFunc( - loc, builder); + } else { + func = + isPointer + ? 
fir::runtime::getRuntimeFunc( + loc, builder) + : fir::runtime::getRuntimeFunc( + loc, builder); + } return convertOpToCall(op, rewriter, func); } diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index 3a88824826de31..23e01da72eded1 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -20,6 +20,7 @@ add_flang_library(${CUFRT_LIBNAME} kernel.cpp memmove-function.cpp memory.cpp + pointer.cpp registration.cpp ) diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp new file mode 100644 index 00000000000000..0c5d3a5a6297d8 --- /dev/null +++ b/flang/runtime/CUDA/pointer.cpp @@ -0,0 +1,40 @@ +//===-- runtime/CUDA/pointer.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/CUDA/pointer.h" +#include "../stat.h" +#include "../terminator.h" +#include "flang/Runtime/pointer.h" + +#include "cuda_runtime.h" + +namespace Fortran::runtime::cuda { + +extern "C" { +RT_EXT_API_GROUP_BEGIN + +int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + if (desc.HasAddendum()) { + Terminator terminator{sourceFile, sourceLine}; + // TODO: This require a bit more work to set the correct type descriptor + // address + terminator.Crash( + "not yet implemented: CUDA descriptor allocation with addendum"); + } + // Perform the standard allocation. 
+ int stat{ + RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)}; + return stat; +} + +RT_EXT_API_GROUP_END + +} // extern "C" + +} // namespace Fortran::runtime::cuda diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 35c6e2a77a697d..2ac9498d355414 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -181,4 +181,15 @@ func.func @_QQallocate_stream() { // CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref // CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 + +func.func @_QPp_alloc() { + %0 = cuf.alloc !fir.box>>> {bindc_name = "complex_array", data_attr = #cuf.cuda, uniq_name = "_QFp_allocEcomplex_array"} -> !fir.ref>>>> + %4 = fir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFp_allocEcomplex_array"} : (!fir.ref>>>>) -> !fir.ref>>>> + %9 = cuf.allocate %4 : !fir.ref>>>> {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QPp_alloc() +// CHECK: fir.call @_FortranACUFPointerAllocate + } // end of module From 98de5dfe6a8cbb70f21de545acec4710a77294ed Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Jan 2025 19:02:17 -0500 Subject: [PATCH 057/208] [mlir] Add NamedAttribute ctor taking StringRef. NFC. (#123974) This is a small QoL improvement so that we don't have to go through helpers when building `NamedAttribute`s. 
--- mlir/include/mlir/IR/Attributes.h | 1 + mlir/include/mlir/IR/OperationSupport.h | 4 +++- mlir/lib/IR/Attributes.cpp | 6 ++++++ mlir/lib/IR/Builders.cpp | 2 +- mlir/lib/IR/OperationSupport.cpp | 5 ----- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h index d347013295d5fc..262d31b20ab084 100644 --- a/mlir/include/mlir/IR/Attributes.h +++ b/mlir/include/mlir/IR/Attributes.h @@ -207,6 +207,7 @@ inline ::llvm::hash_code hash_value(Attribute arg) { class NamedAttribute { public: NamedAttribute(StringAttr name, Attribute value); + NamedAttribute(StringRef name, Attribute value); /// Return the name of the attribute. StringAttr getName() const; diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h index 5eb2d69134ea5f..d4035d14ab7465 100644 --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -819,7 +819,9 @@ class NamedAttrList { } /// Add an attribute with the specified name. - void append(StringRef name, Attribute attr); + void append(StringRef name, Attribute attr) { + append(NamedAttribute(name, attr)); + } /// Add an attribute with the specified name. 
void append(StringAttr name, Attribute attr) { diff --git a/mlir/lib/IR/Attributes.cpp b/mlir/lib/IR/Attributes.cpp index cc7a2a5e586b1c..ff1cd8432fb07d 100644 --- a/mlir/lib/IR/Attributes.cpp +++ b/mlir/lib/IR/Attributes.cpp @@ -46,6 +46,12 @@ NamedAttribute::NamedAttribute(StringAttr name, Attribute value) assert(!name.empty() && "expected valid attribute name"); } +NamedAttribute::NamedAttribute(StringRef name, Attribute value) : value(value) { + assert(value && "expected valid attribute value"); + assert(!name.empty() && "expected valid attribute name"); + this->name = StringAttr::get(value.getContext(), name); +} + StringAttr NamedAttribute::getName() const { return llvm::cast(name); } diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index d57a7ca07ede58..16bd8201ad50a6 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -88,7 +88,7 @@ NoneType Builder::getNoneType() { return NoneType::get(context); } //===----------------------------------------------------------------------===// NamedAttribute Builder::getNamedAttr(StringRef name, Attribute val) { - return NamedAttribute(getStringAttr(name), val); + return NamedAttribute(name, val); } UnitAttr Builder::getUnitAttr() { return UnitAttr::get(context); } diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp index 957195202d78d2..1b2cda19de1e80 100644 --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -62,11 +62,6 @@ DictionaryAttr NamedAttrList::getDictionary(MLIRContext *context) const { return llvm::cast(dictionarySorted.getPointer()); } -/// Add an attribute with the specified name. -void NamedAttrList::append(StringRef name, Attribute attr) { - append(StringAttr::get(attr.getContext(), name), attr); -} - /// Replaces the attributes with new list of attributes. 
void NamedAttrList::assign(const_iterator inStart, const_iterator inEnd) { DictionaryAttr::sort(ArrayRef{inStart, inEnd}, attrs); From 2dc1c95595e409c74a8a3d743afb7898e1af3255 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 22 Jan 2025 16:08:15 -0800 Subject: [PATCH 058/208] [SandboxVec][VecUtils] Implement VecUtils::getLowest() (#124024) VecUtils::getLowest(Valse) returns the lowest instruction in the BB among Vals. If the instructions are not in the same BB, or if none of them is an instruction it returns nullptr. --- .../Vectorize/SandboxVectorizer/VecUtils.h | 29 ++++++++++ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 6 +- .../SandboxVectorizer/VecUtilsTest.cpp | 57 ++++++++++++++++--- 3 files changed, 79 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h index 6cbbb396ea823f..4e3ca2bccfe6fd 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h @@ -100,6 +100,8 @@ class VecUtils { } return FixedVectorType::get(ElemTy, NumElts); } + /// \Returns the instruction in \p Instrs that is lowest in the BB. Expects + /// that all instructions are in the same BB. static Instruction *getLowest(ArrayRef Instrs) { Instruction *LowestI = Instrs.front(); for (auto *I : drop_begin(Instrs)) { @@ -108,6 +110,33 @@ class VecUtils { } return LowestI; } + /// \Returns the lowest instruction in \p Vals, or nullptr if no instructions + /// are found or if not in the same BB. + static Instruction *getLowest(ArrayRef Vals) { + // Find the first Instruction in Vals. + auto It = find_if(Vals, [](Value *V) { return isa(V); }); + // If we couldn't find an instruction return nullptr. + if (It == Vals.end()) + return nullptr; + Instruction *FirstI = cast(*It); + // Now look for the lowest instruction in Vals starting from one position + // after FirstI. 
+ Instruction *LowestI = FirstI; + auto *LowestBB = LowestI->getParent(); + for (auto *V : make_range(std::next(It), Vals.end())) { + auto *I = dyn_cast(V); + // Skip non-instructions. + if (I == nullptr) + continue; + // If the instructions are in different BBs return nullptr. + if (I->getParent() != LowestBB) + return nullptr; + // If `LowestI` comes before `I` then `I` is the new lowest. + if (LowestI->comesBefore(I)) + LowestI = I; + } + return LowestI; + } /// If all values in \p Bndl are of the same scalar type then return it, /// otherwise return nullptr. static Type *tryGetCommonScalarType(ArrayRef Bndl) { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index c6ab3c1942c330..8432b4c6c469ae 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -45,11 +45,7 @@ static SmallVector getOperand(ArrayRef Bndl, static BasicBlock::iterator getInsertPointAfterInstrs(ArrayRef Instrs) { - // TODO: Use the VecUtils function for getting the bottom instr once it lands. - auto *BotI = cast( - *std::max_element(Instrs.begin(), Instrs.end(), [](auto *V1, auto *V2) { - return cast(V1)->comesBefore(cast(V2)); - })); + auto *BotI = VecUtils::getLowest(Instrs); // If Bndl contains Arguments or Constants, use the beginning of the BB. 
return std::next(BotI->getIterator()); } diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp index 8661dcd5067c0a..b69172738d36a5 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp @@ -50,6 +50,14 @@ struct VecUtilsTest : public testing::Test { } }; +sandboxir::BasicBlock &getBasicBlockByName(sandboxir::Function &F, + StringRef Name) { + for (sandboxir::BasicBlock &BB : F) + if (BB.getName() == Name) + return BB; + llvm_unreachable("Expected to find basic block!"); +} + TEST_F(VecUtilsTest, GetNumElements) { sandboxir::Context Ctx(C); auto *ElemTy = sandboxir::Type::getInt32Ty(Ctx); @@ -415,9 +423,11 @@ TEST_F(VecUtilsTest, GetLowest) { parseIR(R"IR( define void @foo(i8 %v) { bb0: - %A = add i8 %v, %v - %B = add i8 %v, %v - %C = add i8 %v, %v + br label %bb1 +bb1: + %A = add i8 %v, 1 + %B = add i8 %v, 2 + %C = add i8 %v, 3 ret void } )IR"); @@ -425,11 +435,21 @@ define void @foo(i8 %v) { sandboxir::Context Ctx(C); auto &F = *Ctx.createFunction(&LLVMF); - auto &BB = *F.begin(); - auto It = BB.begin(); - auto *IA = &*It++; - auto *IB = &*It++; - auto *IC = &*It++; + auto &BB0 = getBasicBlockByName(F, "bb0"); + auto It = BB0.begin(); + auto *BB0I = cast(&*It++); + + auto &BB = getBasicBlockByName(F, "bb1"); + It = BB.begin(); + auto *IA = cast(&*It++); + auto *C1 = cast(IA->getOperand(1)); + auto *IB = cast(&*It++); + auto *C2 = cast(IB->getOperand(1)); + auto *IC = cast(&*It++); + auto *C3 = cast(IC->getOperand(1)); + // Check getLowest(ArrayRef) + SmallVector A({IA}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(A), IA); SmallVector ABC({IA, IB, IC}); EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC), IC); SmallVector ACB({IA, IC, IB}); @@ -438,6 +458,27 @@ define void @foo(i8 %v) { EXPECT_EQ(sandboxir::VecUtils::getLowest(CAB), IC); SmallVector CBA({IC, IB, 
IA}); EXPECT_EQ(sandboxir::VecUtils::getLowest(CBA), IC); + + // Check getLowest(ArrayRef) + SmallVector C1Only({C1}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only), nullptr); + SmallVector AOnly({IA}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly), IA); + SmallVector AC1({IA, C1}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1), IA); + SmallVector C1A({C1, IA}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A), IA); + SmallVector AC1B({IA, C1, IB}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B), IB); + SmallVector ABC1({IA, IB, C1}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1), IB); + SmallVector AC1C2({IA, C1, C2}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2), IA); + SmallVector C1C2C3({C1, C2, C3}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3), nullptr); + + SmallVector DiffBBs({BB0I, IA}); + EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs), nullptr); } TEST_F(VecUtilsTest, GetCommonScalarType) { From 96dbd0006c3c4c9de5f1fe4f3bbac3c74acac436 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Jan 2025 15:47:01 -0800 Subject: [PATCH 059/208] [RISCV] Re-generate test checks so we pick up implicit on whole register moves. 
NFC --- llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir index 5bb6ce250e8db7..f7d5004e11752f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir @@ -13,7 +13,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m2 = VMV2R_V $v28m2 + ; CHECK-NEXT: $v12m2 = VMV2R_V $v28m2, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $v12m2 = COPY $v28m2 @@ -61,7 +61,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28m4 = VL4RE32_V $x16 - ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4 + ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = VL4RE32_V $x16 $v12m4 = COPY $v28m4 @@ -78,7 +78,7 @@ body: | ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v4m4, $x0 = PseudoVLE32FF_V_M4 undef $v4m4, $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit-def $vl - ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4 + ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5, 0, implicit $vl, implicit $vtype $v4m4,$x0 = PseudoVLE32FF_V_M4 undef 
$v4m4, $x16, $noreg, 5, 0, implicit-def $vl @@ -99,7 +99,7 @@ body: | ; CHECK-NEXT: $v0m2 = PseudoVLE32_V_M2 undef $v0m2, $x18, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x0 = PseudoVSETVLIX0 $x0, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v4m4 = PseudoVLE32_V_M4 undef $v4m4, killed $x18, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4 + ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $x15 = PseudoVSETVLI $x17, 73, implicit-def $vl, implicit-def $vtype @@ -145,7 +145,7 @@ body: | ; CHECK-NEXT: $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x0 = PseudoVSETVLIX0 $x0, 73 /* e16, m2, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v0m2 = PseudoVLE32_V_M2 undef $v0m2, $x18, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4 + ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $x0 = PseudoVSETVLIX0 $x0, 73, implicit-def $vl, implicit-def $vtype @@ -165,7 +165,7 @@ body: | ; CHECK-NEXT: $v26m2 = PseudoVLE16_V_M2 undef $v26m2, killed $x16, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v8m2 = PseudoVLE16_V_M2 undef $v8m2, killed $x17, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: early-clobber $v28m4 = PseudoVWADD_VV_M2 undef $v28m4, $v26m2, $v8m2, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m2 = VMV2R_V $v28m2 + ; CHECK-NEXT: $v12m2 = 
VMV2R_V $v28m2, implicit $vtype $x15 = PseudoVSETIVLI 4, 73, implicit-def $vl, implicit-def $vtype $v26m2 = PseudoVLE16_V_M2 undef $v26m2, killed $x16, $noreg, 4, 0, implicit $vl, implicit $vtype $v8m2 = PseudoVLE16_V_M2 undef $v8m2, killed $x17, $noreg, 4, 0, implicit $vl, implicit $vtype @@ -185,7 +185,7 @@ body: | ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x0 = PseudoVSETVLIX0 $x0, 74 /* e16, m4, ta, mu */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4 + ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVLE32_V_M4 undef $v28m4, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $x0 = PseudoVSETVLIX0 $x0, 74, implicit-def $vl, implicit-def $vtype @@ -202,7 +202,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x11 = PseudoVSETIVLI 1, 64 /* e8, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v8 = PseudoVWREDSUM_VS_M1_E8 killed renamable $v8, killed renamable $v26, killed renamable $v27, 1, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v26 = VMV1R_V killed $v8 + ; CHECK-NEXT: $v26 = VMV1R_V killed $v8, implicit $vtype ; CHECK-NEXT: $x10 = PseudoVSETVLI killed renamable $x10, 75 /* e16, m8, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v8m8 = VL8RE8_V killed $x10 $x11 = PseudoVSETIVLI 1, 64, implicit-def $vl, implicit-def $vtype @@ -222,7 +222,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 80 /* e32, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v8_v9 = PseudoVLSEG2E32_V_M1 undef $v8_v9, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v10 = VMV1R_V $v8 + ; CHECK-NEXT: $v10 
= VMV1R_V $v8, implicit $vtype $x15 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype $v8_v9 = PseudoVLSEG2E32_V_M1 undef $v8_v9, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $v10 = COPY $v8 @@ -238,7 +238,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 80 /* e32, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v8_v9 = PseudoVLSEG2E32_V_M1 undef $v8_v9, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v10m2 = VMV2R_V $v8m2 + ; CHECK-NEXT: $v10m2 = VMV2R_V $v8m2, implicit $vtype $x15 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype $v8_v9 = PseudoVLSEG2E32_V_M1 undef $v8_v9, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $v10_v11 = COPY $v8_v9 @@ -254,7 +254,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 87 /* e32, mf2, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28 = PseudoVLE32_V_MF2 undef $v28, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12 = VMV1R_V $v28 + ; CHECK-NEXT: $v12 = VMV1R_V $v28, implicit $vtype $x15 = PseudoVSETVLI $x14, 87, implicit-def $vl, implicit-def $vtype $v28 = PseudoVLE32_V_MF2 undef $v28, killed $x16, $noreg, 5, 0, implicit $vl, implicit $vtype $v12 = COPY $v28 @@ -272,7 +272,7 @@ body: | ; CHECK-NEXT: $v8_v9_v10_v11_v12_v13_v14_v15 = PseudoVLSEG8E32_V_M1 undef $v8_v9_v10_v11_v12_v13_v14_v15, killed $x12, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x0 = PseudoVSETIVLI 10, 80 /* e32, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v15 = PseudoVLE32_V_M1 undef $v15, killed $x16, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype, implicit killed $v8_v9_v10_v11_v12_v13_v14_v15, implicit-def $v8_v9_v10_v11_v12_v13_v14_v15 - ; CHECK-NEXT: $v24m8 = VMV8R_V killed $v8m8 + ; CHECK-NEXT: $v24m8 = VMV8R_V killed $v8m8, implicit 
$vtype $x0 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype $v8_v9_v10_v11_v12_v13_v14_v15 = PseudoVLSEG8E32_V_M1 undef $v8_v9_v10_v11_v12_v13_v14_v15, killed $x12, $noreg, 5, 0, implicit $vl, implicit $vtype $x0 = PseudoVSETIVLI 10, 80, implicit-def $vl, implicit-def $vtype @@ -290,9 +290,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = PseudoVSETVLI $x10, 201 /* e16, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v10m2 = PseudoVLE16_V_M2 undef $v10m2, killed $x11, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v10 = VMV1R_V $v8 - ; CHECK-NEXT: $v11 = VMV1R_V $v9 - ; CHECK-NEXT: $v12m2 = VMV2R_V $v10m2 + ; CHECK-NEXT: $v10 = VMV1R_V $v8, implicit $vtype + ; CHECK-NEXT: $v11 = VMV1R_V $v9, implicit $vtype + ; CHECK-NEXT: $v12m2 = VMV2R_V $v10m2, implicit $vtype $x0 = PseudoVSETVLI $x10, 201, implicit-def $vl, implicit-def $vtype $v10m2 = PseudoVLE16_V_M2 undef $v10m2, killed $x11, $noreg, 4, 0, implicit $vl, implicit $vtype $v10 = COPY $v8 From 9fbf5cfebcd770fbe0e453f36ee7c74809339f18 Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Wed, 22 Jan 2025 16:24:56 -0800 Subject: [PATCH 060/208] [clang][modules] Partially revert 48d0eb518 to fix -gmodules output (#124003) With the changes in 48d0eb518, the CodeGenOptions used to emit .pcm files with -fmodule-format=obj (-gmodules) were the ones from the original invocation, rather than the ones specifically crafted for outputting the pcm. This was causing the pcm to be written with only the debug info and without the __clangast section in some cases (e.g. -O2). This unforunately was not covered by existing tests, because compiling and loading a module within a single compilation load the ast content from the in-memory module cache rather than reading it from the pcm file that was written. This broke bootstrapping a build of clang with modules enabled on Darwin. 
rdar://143418834 --- clang/include/clang/CodeGen/BackendUtil.h | 4 ++-- clang/lib/CodeGen/BackendUtil.cpp | 13 +++++++------ clang/lib/CodeGen/CodeGenAction.cpp | 8 +++++--- .../CodeGen/ObjectFilePCHContainerWriter.cpp | 8 ++++---- clang/test/Modules/gmodules-codegenopts.c | 18 ++++++++++++++++++ 5 files changed, 36 insertions(+), 15 deletions(-) create mode 100644 clang/test/Modules/gmodules-codegenopts.c diff --git a/clang/include/clang/CodeGen/BackendUtil.h b/clang/include/clang/CodeGen/BackendUtil.h index 78d1e5ee8e6d59..92e0d13bf25b69 100644 --- a/clang/include/clang/CodeGen/BackendUtil.h +++ b/clang/include/clang/CodeGen/BackendUtil.h @@ -39,8 +39,8 @@ enum BackendAction { Backend_EmitObj ///< Emit native object files }; -void emitBackendOutput(CompilerInstance &CI, StringRef TDesc, llvm::Module *M, - BackendAction Action, +void emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts, + StringRef TDesc, llvm::Module *M, BackendAction Action, llvm::IntrusiveRefCntPtr VFS, std::unique_ptr OS, BackendConsumer *BC = nullptr); diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index f60f8672e6a0b8..3e65eeb3755d2f 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -206,9 +206,10 @@ class EmitAssemblyHelper { } public: - EmitAssemblyHelper(CompilerInstance &CI, llvm::Module *M, + EmitAssemblyHelper(CompilerInstance &CI, CodeGenOptions &CGOpts, + llvm::Module *M, IntrusiveRefCntPtr VFS) - : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CI.getCodeGenOpts()), + : CI(CI), Diags(CI.getDiagnostics()), CodeGenOpts(CGOpts), TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), TheModule(M), VFS(std::move(VFS)), TargetTriple(TheModule->getTargetTriple()) {} @@ -1364,14 +1365,14 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, } } -void clang::emitBackendOutput(CompilerInstance &CI, StringRef TDesc, - llvm::Module *M, BackendAction Action, +void 
clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts, + StringRef TDesc, llvm::Module *M, + BackendAction Action, IntrusiveRefCntPtr VFS, std::unique_ptr OS, BackendConsumer *BC) { llvm::TimeTraceScope TimeScope("Backend"); DiagnosticsEngine &Diags = CI.getDiagnostics(); - const auto &CGOpts = CI.getCodeGenOpts(); std::unique_ptr EmptyModule; if (!CGOpts.ThinLTOIndexFile.empty()) { @@ -1411,7 +1412,7 @@ void clang::emitBackendOutput(CompilerInstance &CI, StringRef TDesc, } } - EmitAssemblyHelper AsmHelper(CI, M, VFS); + EmitAssemblyHelper AsmHelper(CI, CGOpts, M, VFS); AsmHelper.emitAssembly(Action, std::move(OS), BC); // Verify clang's TargetInfo DataLayout against the LLVM TargetMachine's diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 15311fb2078101..7aa3639cabf392 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -312,7 +312,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { EmbedBitcode(getModule(), CodeGenOpts, llvm::MemoryBufferRef()); - emitBackendOutput(CI, C.getTargetInfo().getDataLayoutString(), getModule(), + emitBackendOutput(CI, CI.getCodeGenOpts(), + C.getTargetInfo().getDataLayoutString(), getModule(), Action, FS, std::move(AsmOutStream), this); Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler)); @@ -1173,8 +1174,9 @@ void CodeGenAction::ExecuteAction() { std::unique_ptr OptRecordFile = std::move(*OptRecordFileOrErr); - emitBackendOutput(CI, CI.getTarget().getDataLayoutString(), TheModule.get(), - BA, CI.getFileManager().getVirtualFileSystemPtr(), + emitBackendOutput(CI, CI.getCodeGenOpts(), + CI.getTarget().getDataLayoutString(), TheModule.get(), BA, + CI.getFileManager().getVirtualFileSystemPtr(), std::move(OS)); if (OptRecordFile) OptRecordFile->keep(); diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 02635ce235a12b..788c8b932ab524 100644 --- 
a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -322,16 +322,16 @@ class PCHContainerGenerator : public ASTConsumer { // Print the IR for the PCH container to the debug output. llvm::SmallString<0> Buffer; clang::emitBackendOutput( - CI, Ctx.getTargetInfo().getDataLayoutString(), M.get(), + CI, CodeGenOpts, Ctx.getTargetInfo().getDataLayoutString(), M.get(), BackendAction::Backend_EmitLL, FS, std::make_unique(Buffer)); llvm::dbgs() << Buffer; }); // Use the LLVM backend to emit the pch container. - clang::emitBackendOutput(CI, Ctx.getTargetInfo().getDataLayoutString(), - M.get(), BackendAction::Backend_EmitObj, FS, - std::move(OS)); + clang::emitBackendOutput(CI, CodeGenOpts, + Ctx.getTargetInfo().getDataLayoutString(), M.get(), + BackendAction::Backend_EmitObj, FS, std::move(OS)); // Free the memory for the temporary buffer. llvm::SmallVector Empty; diff --git a/clang/test/Modules/gmodules-codegenopts.c b/clang/test/Modules/gmodules-codegenopts.c new file mode 100644 index 00000000000000..417bbb9cecefc1 --- /dev/null +++ b/clang/test/Modules/gmodules-codegenopts.c @@ -0,0 +1,18 @@ +// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} +// Check that the output from -gmodules can be loaded back by the compiler in +// the presence of certain options like optimization level that could break +// output. Note: without compiling twice the module is loaded from the in-memory +// module cache not load it from the object container. + +// RUN: rm -rf %t +// RUN: %clang_cc1 -x objective-c -fmodules -fmodule-format=obj \ +// RUN: -fimplicit-module-maps -fmodules-cache-path=%t %s \ +// RUN: -I %S/Inputs -verify -O2 + +// Compile again, confirming we can load the module. 
+// RUN: %clang_cc1 -x objective-c -fmodules -fmodule-format=obj \ +// RUN: -fimplicit-module-maps -fmodules-cache-path=%t %s \ +// RUN: -I %S/Inputs -verify -O2 + +@import DebugObjC; +// expected-no-diagnostics \ No newline at end of file From 8f45452c5309d0ae59dd383de6dae1aa4eabbb9c Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 22 Jan 2025 16:29:43 -0800 Subject: [PATCH 061/208] workflows/release-binaries: Restrict jobs based on owner instead of repo (#123797) Not really any functional change, just a clean up that could make it easier to share snippets with other repos. --- .github/workflows/release-binaries.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 29be8195da68ac..f9a264e7cf48f1 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -49,7 +49,7 @@ jobs: prepare: name: Prepare to build binaries runs-on: ${{ inputs.runs-on }} - if: github.repository == 'llvm/llvm-project' + if: github.repository_owner == 'llvm' outputs: release-version: ${{ steps.vars.outputs.release-version }} ref: ${{ steps.vars.outputs.ref }} @@ -177,7 +177,7 @@ jobs: build-release-package: name: "Build Release Package" needs: prepare - if: github.repository == 'llvm/llvm-project' + if: github.repository_owner == 'llvm' runs-on: ${{ needs.prepare.outputs.build-runs-on }} steps: @@ -327,7 +327,7 @@ jobs: - prepare - build-release-package if: >- - github.repository == 'llvm/llvm-project' + github.repository_owner == 'llvm' runs-on: ${{ needs.prepare.outputs.test-runs-on }} steps: - name: Checkout Actions From 8110af75b1500be2313e523a2d2da6bb7806b700 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 22 Jan 2025 16:38:10 -0800 Subject: [PATCH 062/208] [SandboxVec][BottomUpVec] Fix codegen when packing constants. 
(#124033) Before this patch packing a bundle of constants would crash because `getInsertPointAfterInstrs()` expected instructions. This patch fixes this. --- .../SandboxVectorizer/Passes/BottomUpVec.h | 12 +++-- .../SandboxVectorizer/Passes/BottomUpVec.cpp | 44 ++++++++++++------- .../test/Transforms/SandboxVectorizer/pack.ll | 16 +++++++ 3 files changed, 52 insertions(+), 20 deletions(-) create mode 100644 llvm/test/Transforms/SandboxVectorizer/pack.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index b463b8acf4c86e..147a86de4e34ec 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -38,16 +38,20 @@ class BottomUpVec final : public FunctionPass { /// collected during vectorization. void tryEraseDeadInstrs(); /// Creates a shuffle instruction that shuffles \p VecOp according to \p Mask. - Value *createShuffle(Value *VecOp, const ShuffleMask &Mask); - /// Packs all elements of \p ToPack into a vector and returns that vector. - Value *createPack(ArrayRef ToPack); + /// \p UserBB is the block of the user bundle. + Value *createShuffle(Value *VecOp, const ShuffleMask &Mask, + BasicBlock *UserBB); + /// Packs all elements of \p ToPack into a vector and returns that vector. \p + /// UserBB is the block of the user bundle. + Value *createPack(ArrayRef ToPack, BasicBlock *UserBB); /// After we create vectors for groups of instructions, the original /// instructions are potentially dead and may need to be removed. This /// function helps collect these instructions (along with the pointer operands /// for loads/stores) so that they can be cleaned up later. void collectPotentiallyDeadInstrs(ArrayRef Bndl); /// Recursively try to vectorize \p Bndl and its operands. 
- Value *vectorizeRec(ArrayRef Bndl, unsigned Depth); + Value *vectorizeRec(ArrayRef Bndl, ArrayRef UserBndl, + unsigned Depth); /// Entry point for vectorization starting from \p Seeds. bool tryVectorize(ArrayRef Seeds); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 8432b4c6c469ae..18c3b375c92a23 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -43,10 +43,15 @@ static SmallVector getOperand(ArrayRef Bndl, return Operands; } -static BasicBlock::iterator -getInsertPointAfterInstrs(ArrayRef Instrs) { - auto *BotI = VecUtils::getLowest(Instrs); - // If Bndl contains Arguments or Constants, use the beginning of the BB. +/// \Returns the BB iterator after the lowest instruction in \p Vals, or the top +/// of BB if no instruction found in \p Vals. +static BasicBlock::iterator getInsertPointAfterInstrs(ArrayRef Vals, + BasicBlock *BB) { + auto *BotI = VecUtils::getLowest(Vals); + if (BotI == nullptr) + // We are using BB->begin() as the fallback insert point if `ToPack` did + // not contain instructions. 
+ return BB->begin(); return std::next(BotI->getIterator()); } @@ -61,7 +66,8 @@ Value *BottomUpVec::createVectorInstr(ArrayRef Bndl, Type *ScalarTy = VecUtils::getElementType(Utils::getExpectedType(Bndl[0])); auto *VecTy = VecUtils::getWideType(ScalarTy, VecUtils::getNumLanes(Bndl)); - BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(Bndl); + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs( + Bndl, cast(Bndl[0])->getParent()); auto Opcode = cast(Bndl[0])->getOpcode(); switch (Opcode) { @@ -175,14 +181,15 @@ void BottomUpVec::tryEraseDeadInstrs() { DeadInstrCandidates.clear(); } -Value *BottomUpVec::createShuffle(Value *VecOp, const ShuffleMask &Mask) { - BasicBlock::iterator WhereIt = getInsertPointAfterInstrs({VecOp}); +Value *BottomUpVec::createShuffle(Value *VecOp, const ShuffleMask &Mask, + BasicBlock *UserBB) { + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs({VecOp}, UserBB); return ShuffleVectorInst::create(VecOp, VecOp, Mask, WhereIt, VecOp->getContext(), "VShuf"); } -Value *BottomUpVec::createPack(ArrayRef ToPack) { - BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(ToPack); +Value *BottomUpVec::createPack(ArrayRef ToPack, BasicBlock *UserBB) { + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(ToPack, UserBB); Type *ScalarTy = VecUtils::getCommonScalarType(ToPack); unsigned Lanes = VecUtils::getNumLanes(ToPack); @@ -258,8 +265,12 @@ void BottomUpVec::collectPotentiallyDeadInstrs(ArrayRef Bndl) { } } -Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { +Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, + ArrayRef UserBndl, unsigned Depth) { Value *NewVec = nullptr; + auto *UserBB = !UserBndl.empty() + ? 
cast(UserBndl.front())->getParent() + : cast(Bndl[0])->getParent(); const auto &LegalityRes = Legality->canVectorize(Bndl); switch (LegalityRes.getSubclassID()) { case LegalityResultID::Widen: { @@ -272,7 +283,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { break; case Instruction::Opcode::Store: { // Don't recurse towards the pointer operand. - auto *VecOp = vectorizeRec(getOperand(Bndl, 0), Depth + 1); + auto *VecOp = vectorizeRec(getOperand(Bndl, 0), Bndl, Depth + 1); VecOperands.push_back(VecOp); VecOperands.push_back(cast(I)->getPointerOperand()); break; @@ -280,7 +291,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { default: // Visit all operands. for (auto OpIdx : seq(I->getNumOperands())) { - auto *VecOp = vectorizeRec(getOperand(Bndl, OpIdx), Depth + 1); + auto *VecOp = vectorizeRec(getOperand(Bndl, OpIdx), Bndl, Depth + 1); VecOperands.push_back(VecOp); } break; @@ -301,7 +312,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { auto *VecOp = cast(LegalityRes).getVector(); const ShuffleMask &Mask = cast(LegalityRes).getMask(); - NewVec = createShuffle(VecOp, Mask); + NewVec = createShuffle(VecOp, Mask, UserBB); break; } case LegalityResultID::DiamondReuseMultiInput: { @@ -315,7 +326,8 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { if (auto *I = dyn_cast(ElmDescr.getValue())) DescrInstrs.push_back(I); } - auto WhereIt = getInsertPointAfterInstrs(DescrInstrs); + BasicBlock::iterator WhereIt = + getInsertPointAfterInstrs(DescrInstrs, UserBB); Value *LastV = PoisonValue::get(ResTy); for (auto [Lane, ElmDescr] : enumerate(Descr.getDescrs())) { @@ -342,7 +354,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { // If we can't vectorize the seeds then just return. 
if (Depth == 0) return nullptr; - NewVec = createPack(Bndl); + NewVec = createPack(Bndl, UserBB); break; } } @@ -352,7 +364,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { bool BottomUpVec::tryVectorize(ArrayRef Bndl) { DeadInstrCandidates.clear(); Legality->clear(); - vectorizeRec(Bndl, /*Depth=*/0); + vectorizeRec(Bndl, {}, /*Depth=*/0); tryEraseDeadInstrs(); return Change; } diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll new file mode 100644 index 00000000000000..6607b31c021941 --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s + +define void @pack_constants(ptr %ptr) { +; CHECK-LABEL: define void @pack_constants( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i32 0 +; CHECK-NEXT: store <2 x i8> , ptr [[PTR0]], align 1 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i32 0 + %ptr1 = getelementptr i8, ptr %ptr, i32 1 + store i8 0, ptr %ptr0 + store i8 1, ptr %ptr1 + ret void +} From d5457e4c1619e5dbeefd49841e284cbc24d35cb4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 22 Jan 2025 16:52:09 -0800 Subject: [PATCH 063/208] [NFC][Index] Disable LSAN on crash recovery tests (#124035) Avoiding leaks in such cases is very hard. There are similar suppressions in other Index tests.
--- clang/test/Index/crash-recovery-modules.m | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/test/Index/crash-recovery-modules.m b/clang/test/Index/crash-recovery-modules.m index 00020e0e8162c9..16cf4b396b47a7 100644 --- a/clang/test/Index/crash-recovery-modules.m +++ b/clang/test/Index/crash-recovery-modules.m @@ -1,10 +1,12 @@ +// RUN: export LSAN_OPTIONS=detect_leaks=0 + // Clear out the module cache entirely, so we start from nothing. // RUN: rm -rf %t // Parse the file, such that building the module will cause Clang to crash. // RUN: env CINDEXTEST_FAILONERROR=1 not c-index-test -test-load-source all -fmodules -fmodules-cache-path=%t -Xclang -fdisable-module-hash -I %S/Inputs/Headers -DCRASH %s > /dev/null 2> %t.err // RUN: FileCheck < %t.err -check-prefix=CHECK-CRASH %s -// CHECK-CRASH: crash-recovery-modules.m:16:9:{16:2-16:14}: fatal error: could not build module 'Crash' +// CHECK-CRASH: crash-recovery-modules.m:[[@LINE+9]]:9:{[[@LINE+9]]:2-[[@LINE+9]]:14}: fatal error: could not build module 'Crash' // Parse the file again, without crashing, to make sure that // subsequent parses do the right thing. 
From eaaac050588ec67afcdbb347df5597458a9b10d1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Jan 2025 17:19:17 -0800 Subject: [PATCH 064/208] [Sema] Fix a warning This patch fixes: clang/lib/Sema/SemaSYCL.cpp:428:25: error: unused variable 'SKI' [-Werror,-Wunused-variable] --- clang/lib/Sema/SemaSYCL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 5efbd66c66f8d4..ddd92782366b5f 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -429,6 +429,7 @@ StmtResult SemaSYCL::BuildSYCLKernelCallStmt(FunctionDecl *FD, getASTContext().getSYCLKernelInfo(SKEPAttr->getKernelName()); assert(declaresSameEntity(SKI.getKernelEntryPointDecl(), FD) && "SYCL kernel name conflict"); + (void)SKI; using ParmDeclMap = OutlinedFunctionDeclBodyInstantiator::ParmDeclMap; ParmDeclMap ParmMap; From a9d2834508e276d0a3cc09ac549132b56796e87f Mon Sep 17 00:00:00 2001 From: Hua Tian Date: Thu, 23 Jan 2025 09:39:03 +0800 Subject: [PATCH 065/208] [llvm][CodeGen] Fix the issue caused by live interval checking in window scheduler (#123184) In some corner cases, the cloned MI still retains an old slot index, which leads to the compiler crashing. This patch updates the slot index map before deleting the recycled MI.
https://github.com/llvm/llvm-project/issues/123165 --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +- llvm/lib/CodeGen/ModuloSchedule.cpp | 2 +- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 1 - llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2 - llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 7 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 6 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 - .../swp-ws-live-intervals-issue123165.mir | 86 +++++++++++++++++++ 8 files changed, 99 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-live-intervals-issue123165.mir diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index bc90364875b682..a91cb0d4f603b3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -810,7 +810,7 @@ class TargetInstrInfo : public MCInstrInfo { /// /// Once this function is called, no other functions on this object are /// valid; the loop has been removed. - virtual void disposed() = 0; + virtual void disposed(LiveIntervals *LIS = nullptr) {} /// Return true if the target can expand pipelined schedule with modulo /// variable expansion. 
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 414c8cd71809db..d99b6ace01000d 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -899,7 +899,7 @@ void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, LastEpi->eraseFromParent(); } if (LastPro == KernelBB) { - LoopInfo->disposed(); + LoopInfo->disposed(&LIS); NewKernel = nullptr; } LastPro->clear(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 7ccd65b1052200..17dd8a073eff0f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -10143,7 +10143,6 @@ class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { void adjustTripCount(int TripCountAdjust) override {} - void disposed() override {} bool isMVEExpanderSupported() override { return true; } }; } // namespace diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index c167c1897bc912..839b7e81f8998f 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6850,8 +6850,6 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { void setPreheader(MachineBasicBlock *NewPreheader) override {} void adjustTripCount(int TripCountAdjust) override {} - - void disposed() override {} }; void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT, diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index f30c45e820612c..c54114513ac03c 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LivePhysRegs.h" #include 
"llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -795,7 +796,11 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { Loop->getOperand(1).setReg(NewLoopCount); } - void disposed() override { Loop->eraseFromParent(); } + void disposed(LiveIntervals *LIS) override { + if (LIS) + LIS->RemoveMachineInstrFromMaps(*Loop); + Loop->eraseFromParent(); + } }; } // namespace diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 04b58bba7251e0..3aef6f2c893fa2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -5693,7 +5693,11 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { // so we don't need to generate any thing here. } - void disposed() override { + void disposed(LiveIntervals *LIS) override { + if (LIS) { + LIS->RemoveMachineInstrFromMaps(*Loop); + LIS->RemoveMachineInstrFromMaps(*LoopCount); + } Loop->eraseFromParent(); // Ensure the loop setup instruction is deleted too. 
LoopCount->eraseFromParent(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 471cd15ee9c870..e6678a795c807f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4277,8 +4277,6 @@ class RISCVPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { void setPreheader(MachineBasicBlock *NewPreheader) override {} void adjustTripCount(int TripCountAdjust) override {} - - void disposed() override {} }; } // namespace diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-live-intervals-issue123165.mir b/llvm/test/CodeGen/Hexagon/swp-ws-live-intervals-issue123165.mir new file mode 100644 index 00000000000000..779a414b689fe5 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-live-intervals-issue123165.mir @@ -0,0 +1,86 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc --mtriple=hexagon %s -run-pass=pipeliner -o -| FileCheck %s + +... 
+--- +name: test_swp_ws_live_intervals +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_swp_ws_live_intervals + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:intregs = COPY $r0 + ; CHECK-NEXT: J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[L2_loadri_io:%[0-9]+]]:intregs = L2_loadri_io [[COPY]], 0 + ; CHECK-NEXT: [[L2_loadrub_io:%[0-9]+]]:intregs = L2_loadrub_io [[L2_loadri_io]], 0 + ; CHECK-NEXT: [[PS_loadriabs:%[0-9]+]]:intregs = PS_loadriabs 0 + ; CHECK-NEXT: S2_storerb_io [[PS_loadriabs]], 0, [[L2_loadrub_io]] + ; CHECK-NEXT: ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[A2_tfrsi:%[0-9]+]]:intregs = A2_tfrsi 0 + ; CHECK-NEXT: [[A2_tfrsi1:%[0-9]+]]:intregs = A2_tfrsi -1 + ; CHECK-NEXT: J2_jump %bb.5, implicit-def $pc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S2_storeri_io [[COPY]], 0, %18 + ; CHECK-NEXT: PS_jmpret $r31, implicit-def dead $pc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[A2_addi:%[0-9]+]]:intregs = A2_addi [[A2_tfrsi1]], 1 + ; CHECK-NEXT: J2_jump %bb.7, implicit-def $pc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:intregs = PHI [[A2_tfrsi]], %bb.5 + ; CHECK-NEXT: J2_jump %bb.3, implicit-def $pc + bb.0: + successors: %bb.1(0x80000000) + liveins: $r0 + + %0:intregs = COPY $r0 + J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.1: + successors: %bb.2(0x04000000), 
%bb.1(0x7c000000) + + %1:intregs = L2_loadri_io %0, 0 + %2:intregs = L2_loadrub_io killed %1, 0 + %3:intregs = PS_loadriabs 0 + S2_storerb_io killed %3, 0, killed %2 + ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + + bb.2: + successors: %bb.4(0x80000000) + + %4:intregs = A2_tfrsi 0 + %5:intregs = A2_tfrsi -1 + J2_loop0i %bb.4, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.4, implicit-def $pc + + bb.3: + S2_storeri_io %0, 0, %6 + PS_jmpret $r31, implicit-def dead $pc + + bb.4: + successors: %bb.3(0x04000000), %bb.4(0x7c000000) + + %7:intregs = PHI %5, %bb.2, %8, %bb.4 + %6:intregs = PHI %4, %bb.2, %9, %bb.4 + %8:intregs = A2_addi %7, 1 + %9:intregs = S2_setbit_i %8, 0 + ENDLOOP0 %bb.4, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def $pc + +... From 847acbbc529133b2300721a809751891200f37f5 Mon Sep 17 00:00:00 2001 From: epitavy <32581827+epitavy@users.noreply.github.com> Date: Thu, 23 Jan 2025 02:40:38 +0100 Subject: [PATCH 066/208] [ExceptionDemo] Transition example from MCJIT to ORC and fix errors (#92504) ExceptionDemo has been broken for some time. This patch fixes the compilation errors and moves the example from MCJIT to ORC. 
--- llvm/examples/ExceptionDemo/CMakeLists.txt | 4 +- llvm/examples/ExceptionDemo/ExceptionDemo.cpp | 183 +++++++----------- 2 files changed, 76 insertions(+), 111 deletions(-) diff --git a/llvm/examples/ExceptionDemo/CMakeLists.txt b/llvm/examples/ExceptionDemo/CMakeLists.txt index 6c125fe20fb6fb..5d8e4c7d439b30 100644 --- a/llvm/examples/ExceptionDemo/CMakeLists.txt +++ b/llvm/examples/ExceptionDemo/CMakeLists.txt @@ -1,9 +1,7 @@ set(LLVM_LINK_COMPONENTS Core ExecutionEngine - MC - MCJIT - RuntimeDyld + ORCJIT Support Target nativecodegen diff --git a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp index 58367a2319981d..ae334419c8f814 100644 --- a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp +++ b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp @@ -49,8 +49,9 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/ExecutionEngine/MCJIT.h" -#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -84,6 +85,8 @@ #define USE_GLOBAL_STR_CONSTS true #endif +llvm::ExitOnError ExitOnErr; + // // Example types // @@ -142,6 +145,7 @@ static llvm::ConstantInt *ourExceptionCaughtState; typedef std::vector ArgNames; typedef std::vector ArgTypes; +typedef llvm::ArrayRef TypeArray; // // Code Generation Utilities @@ -891,13 +895,10 @@ void generateStringPrint(llvm::LLVMContext &context, /// generated, and is used to hold the constant string. A value of /// false indicates that the constant string will be stored on the /// stack. 
-void generateIntegerPrint(llvm::LLVMContext &context, - llvm::Module &module, +void generateIntegerPrint(llvm::LLVMContext &context, llvm::Module &module, llvm::IRBuilder<> &builder, - llvm::Function &printFunct, - llvm::Value &toPrint, - std::string format, - bool useGlobal = true) { + llvm::Function &printFunct, llvm::Value *toPrint, + std::string format, bool useGlobal = true) { llvm::Constant *stringConstant = llvm::ConstantDataArray::getString(context, format); llvm::Value *stringVar; @@ -919,10 +920,9 @@ void generateIntegerPrint(llvm::LLVMContext &context, llvm::Value *cast = builder.CreateBitCast(stringVar, builder.getPtrTy()); - builder.CreateCall(&printFunct, {&toPrint, cast}); + builder.CreateCall(&printFunct, {toPrint, cast}); } - /// Generates code to handle finally block type semantics: always runs /// regardless of whether a thrown exception is passing through or the /// parent function is simply exiting. In addition to printing some state @@ -996,10 +996,10 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context, bufferToPrint.str(), USE_GLOBAL_STR_CONSTS); - llvm::SwitchInst *theSwitch = builder.CreateSwitch(builder.CreateLoad( - *exceptionCaughtFlag), - &terminatorBlock, - 2); + llvm::SwitchInst *theSwitch = builder.CreateSwitch( + builder.CreateLoad(ourExceptionNotThrownState->getType(), + *exceptionCaughtFlag), + &terminatorBlock, 2); theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock); theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock); @@ -1185,7 +1185,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Note: function handles NULL exceptions builder.CreateCall(deleteOurException, - builder.CreateLoad(exceptionStorage)); + builder.CreateLoad(builder.getPtrTy(), exceptionStorage)); builder.CreateRetVoid(); // Normal Block @@ -1205,7 +1205,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( builder.SetInsertPoint(unwindResumeBlock); - 
builder.CreateResume(builder.CreateLoad(caughtResultStorage)); + builder.CreateResume( + builder.CreateLoad(ourCaughtResultType, caughtResultStorage)); // Exception Block @@ -1240,11 +1241,9 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Retrieve exception_class member from thrown exception // (_Unwind_Exception instance). This member tells us whether or not // the exception is foreign. - llvm::Value *unwindExceptionClass = - builder.CreateLoad(builder.CreateStructGEP( - ourUnwindExceptionType, - unwindException, - 0)); + llvm::Value *unwindExceptionClass = builder.CreateLoad( + builder.getInt64Ty(), + builder.CreateStructGEP(ourUnwindExceptionType, unwindException, 0)); // Branch to the externalExceptionBlock if the exception is foreign or // to a catch router if not. Either way the finally block will be run. @@ -1275,8 +1274,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( // (OurException instance). // // Note: ourBaseFromUnwindOffset is usually negative - llvm::Value *typeInfoThrown = builder.CreateConstGEP1_64(unwindException, - ourBaseFromUnwindOffset)); + llvm::Value *typeInfoThrown = builder.CreateConstGEP1_64( + builder.getInt8Ty(), unwindException, ourBaseFromUnwindOffset); // Retrieve thrown exception type info type // @@ -1285,17 +1284,14 @@ static llvm::Function *createCatchWrappedInvokeFunction( typeInfoThrown = builder.CreateStructGEP(ourExceptionType, typeInfoThrown, 0); llvm::Value *typeInfoThrownType = - builder.CreateStructGEP(builder.getPtrTy(), typeInfoThrown, 0); + builder.CreateStructGEP(ourTypeInfoType, typeInfoThrown, 0); - generateIntegerPrint(context, - module, - builder, - *toPrint32Int, - *(builder.CreateLoad(typeInfoThrownType)), + llvm::Value *ti32 = + builder.CreateLoad(builder.getInt32Ty(), typeInfoThrownType); + generateIntegerPrint(context, module, builder, *toPrint32Int, ti32, "Gen: Exception type <%d> received (stack unwound) " " in " + - ourId + - ".\n", + ourId + ".\n", USE_GLOBAL_STR_CONSTS); // 
Route to matched type info catch block or run cleanup finally block @@ -1307,8 +1303,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( for (unsigned i = 1; i <= numExceptionsToCatch; ++i) { nextTypeToCatch = i - 1; - switchToCatchBlock->addCase(llvm::ConstantInt::get( - llvm::Type::getInt32Ty(context), i), + switchToCatchBlock->addCase(llvm::ConstantInt::get(builder.getInt32Ty(), i), catchBlocks[nextTypeToCatch]); } @@ -1383,14 +1378,10 @@ createThrowExceptionFunction(llvm::Module &module, llvm::IRBuilder<> &builder, builder.SetInsertPoint(entryBlock); llvm::Function *toPrint32Int = module.getFunction("print32Int"); - generateIntegerPrint(context, - module, - builder, - *toPrint32Int, - *exceptionType, - "\nGen: About to throw exception type <%d> in " + - ourId + - ".\n", + generateIntegerPrint(context, module, builder, *toPrint32Int, + builder.CreateZExt(exceptionType, builder.getInt32Ty()), + "\nGen: About to throw exception type <%d> in " + ourId + + ".\n", USE_GLOBAL_STR_CONSTS); // Switches on runtime type info type value to determine whether or not @@ -1542,15 +1533,12 @@ typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow); /// @param function generated test function to run /// @param typeToThrow type info type of generated exception to throw, or /// indicator to cause foreign exception to be thrown. 
-static -void runExceptionThrow(llvm::ExecutionEngine *engine, - llvm::Function *function, - int32_t typeToThrow) { +static void runExceptionThrow(llvm::orc::LLJIT *JIT, std::string function, + int32_t typeToThrow) { // Find test's function pointer OurExceptionThrowFunctType functPtr = - reinterpret_cast( - reinterpret_cast(engine->getPointerToFunction(function))); + ExitOnErr(JIT->lookup(function)).toPtr(); try { // Run test @@ -1579,8 +1567,6 @@ void runExceptionThrow(llvm::ExecutionEngine *engine, // End test functions // -typedef llvm::ArrayRef TypeArray; - /// This initialization routine creates type info globals and /// adds external function declarations to module. /// @param numTypeInfos number of linear type info associated type info types @@ -1861,7 +1847,8 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos, // llvm.eh.typeid.for intrinsic - getDeclaration(&module, llvm::Intrinsic::eh_typeid_for, builder.getPtrTy()); + getOrInsertDeclaration(&module, llvm::Intrinsic::eh_typeid_for, + builder.getPtrTy()); } @@ -1890,93 +1877,73 @@ int main(int argc, char *argv[]) { return(0); } - // If not set, exception handling will not be turned on - llvm::TargetOptions Opts; - llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - llvm::LLVMContext Context; - llvm::IRBuilder<> theBuilder(Context); + auto Context = std::make_unique(); + llvm::IRBuilder<> theBuilder(*Context); // Make the module, which holds all the code. 
std::unique_ptr Owner = - std::make_unique("my cool jit", Context); + std::make_unique("my cool jit", *Context); llvm::Module *module = Owner.get(); - std::unique_ptr MemMgr(new llvm::SectionMemoryManager()); + // Build LLJIT + std::unique_ptr JIT = + ExitOnErr(llvm::orc::LLJITBuilder().create()); - // Build engine with JIT - llvm::EngineBuilder factory(std::move(Owner)); - factory.setEngineKind(llvm::EngineKind::JIT); - factory.setTargetOptions(Opts); - factory.setMCJITMemoryManager(std::move(MemMgr)); - llvm::ExecutionEngine *executionEngine = factory.create(); + // Set up the optimizer pipeline. + llvm::legacy::FunctionPassManager fpm(module); - { - llvm::legacy::FunctionPassManager fpm(module); - - // Set up the optimizer pipeline. - // Start with registering info about how the - // target lays out data structures. - module->setDataLayout(executionEngine->getDataLayout()); - - // Optimizations turned on + // Optimizations turned on #ifdef ADD_OPT_PASSES - // Basic AliasAnslysis support for GVN. - fpm.add(llvm::createBasicAliasAnalysisPass()); + // Basic AliasAnslysis support for GVN. + fpm.add(llvm::createBasicAliasAnalysisPass()); - // Promote allocas to registers. - fpm.add(llvm::createPromoteMemoryToRegisterPass()); + // Promote allocas to registers. + fpm.add(llvm::createPromoteMemoryToRegisterPass()); - // Do simple "peephole" optimizations and bit-twiddling optzns. - fpm.add(llvm::createInstructionCombiningPass()); + // Do simple "peephole" optimizations and bit-twiddling optzns. + fpm.add(llvm::createInstructionCombiningPass()); - // Reassociate expressions. - fpm.add(llvm::createReassociatePass()); + // Reassociate expressions. + fpm.add(llvm::createReassociatePass()); - // Eliminate Common SubExpressions. - fpm.add(llvm::createGVNPass()); + // Eliminate Common SubExpressions. + fpm.add(llvm::createGVNPass()); - // Simplify the control flow graph (deleting unreachable - // blocks, etc). 
- fpm.add(llvm::createCFGSimplificationPass()); + // Simplify the control flow graph (deleting unreachable + // blocks, etc). + fpm.add(llvm::createCFGSimplificationPass()); #endif // ADD_OPT_PASSES - fpm.doInitialization(); + fpm.doInitialization(); - // Generate test code using function throwCppException(...) as - // the function which throws foreign exceptions. - llvm::Function *toRun = - createUnwindExceptionTest(*module, - theBuilder, - fpm, - "throwCppException"); + // Generate test code using function throwCppException(...) as + // the function which throws foreign exceptions. + createUnwindExceptionTest(*module, theBuilder, fpm, "throwCppException"); - executionEngine->finalizeObject(); + ExitOnErr(JIT->addIRModule( + llvm::orc::ThreadSafeModule(std::move(Owner), std::move(Context)))); #ifndef NDEBUG - fprintf(stderr, "\nBegin module dump:\n\n"); + fprintf(stderr, "\nBegin module dump:\n\n"); - module->dump(); + module->print(llvm::errs(), nullptr); - fprintf(stderr, "\nEnd module dump:\n"); + fprintf(stderr, "\nEnd module dump:\n"); #endif - fprintf(stderr, "\n\nBegin Test:\n"); - - for (int i = 1; i < argc; ++i) { - // Run test for each argument whose value is the exception - // type to throw. - runExceptionThrow(executionEngine, - toRun, - (unsigned) strtoul(argv[i], NULL, 10)); - } + fprintf(stderr, "\n\nBegin Test:\n"); + std::string toRun = "outerCatchFunct"; - fprintf(stderr, "\nEnd Test:\n\n"); + for (int i = 1; i < argc; ++i) { + // Run test for each argument whose value is the exception + // type to throw. + runExceptionThrow(JIT.get(), toRun, (unsigned)strtoul(argv[i], NULL, 10)); } - delete executionEngine; + fprintf(stderr, "\nEnd Test:\n\n"); return 0; } From 23d2a1862a8b60cf5a04ffabdf5c1ea776120d04 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 08:46:59 +0700 Subject: [PATCH 067/208] PeepholeOpt: Remove unnecessary check for null TargetInstrInfo (#123929) This can never happen. 
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 5d76d3688dfefa..d56f040cf421fd 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1991,11 +1991,6 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { // turn that into an assertion. return ValueTrackerResult(); - if (!TII) - // We could handle the REG_SEQUENCE here, but we do not want to - // duplicate the code from the generic TII. - return ValueTrackerResult(); - SmallVector RegSeqInputRegs; if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs)) return ValueTrackerResult(); @@ -2024,11 +2019,6 @@ ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() { // I.e., this may be turned into an assert. return ValueTrackerResult(); - if (!TII) - // We could handle the REG_SEQUENCE here, but we do not want to - // duplicate the code from the generic TII. - return ValueTrackerResult(); - RegSubRegPair BaseReg; RegSubRegPairAndIdx InsertedReg; if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg)) @@ -2078,11 +2068,6 @@ ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() { if (DefSubReg) return ValueTrackerResult(); - if (!TII) - // We could handle the EXTRACT_SUBREG here, but we do not want to - // duplicate the code from the generic TII. - return ValueTrackerResult(); - RegSubRegPairAndIdx ExtractSubregInputReg; if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg)) return ValueTrackerResult(); From d3aea77f50a2215a9fa50c1bfa5f4b9717d8e928 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 23 Jan 2025 09:47:38 +0800 Subject: [PATCH 068/208] [SLP] Move transformMaskAfterShuffle into BaseShuffleAnalysis and use it as much as possible. 
(#123896) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 961cab33c579f2..fc6bba6d2b8b3b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9487,6 +9487,15 @@ class BaseShuffleAnalysis { return Builder.createShuffleVector(V1, NewMask); return Builder.createIdentity(V1); } + + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef CommonMask, + ArrayRef Mask) { + for (unsigned I : seq(CommonMask.size())) + if (Mask[I] != PoisonMaskElem) + CommonMask[I] = I; + } }; } // namespace @@ -10317,14 +10326,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } return Cost; } - /// Transforms mask \p CommonMask per given \p Mask to make proper set after - /// shuffle emission. - static void transformMaskAfterShuffle(MutableArrayRef CommonMask, - ArrayRef Mask) { - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; - } /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given /// mask \p Mask, register number \p Part, that includes \p SliceSize /// elements. 
@@ -10947,9 +10948,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Cost += createShuffle(Vec, InVectors.back(), CommonMask); else Cost += createShuffle(Vec, nullptr, CommonMask); - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; + transformMaskAfterShuffle(CommonMask, CommonMask); assert(VF > 0 && "Expected vector length for the final value before action."); Value *V = cast(Vec); @@ -10962,9 +10961,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Cost += createShuffle(Vec, InVectors.back(), CommonMask); else Cost += createShuffle(Vec, nullptr, CommonMask); - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; + transformMaskAfterShuffle(CommonMask, CommonMask); // Add subvectors permutation cost. if (!SubVectorsMask.empty()) { assert(SubVectorsMask.size() <= CommonMask.size() && @@ -14214,15 +14211,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { ShuffleBuilder); } - /// Transforms mask \p CommonMask per given \p Mask to make proper set after - /// shuffle emission. - static void transformMaskAfterShuffle(MutableArrayRef CommonMask, - ArrayRef Mask) { - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; - } - /// Cast value \p V to the vector type with the same number of elements, but /// the base type \p ScalarTy. 
Value *castToScalarTyElem(Value *V, @@ -14555,9 +14543,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } else { Vec = createShuffle(Vec, nullptr, CommonMask); } - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; + transformMaskAfterShuffle(CommonMask, CommonMask); assert(VF > 0 && "Expected vector length for the final value before action."); unsigned VecVF = cast(Vec->getType())->getNumElements(); @@ -14577,9 +14563,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } else { Vec = createShuffle(Vec, nullptr, CommonMask); } - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; + transformMaskAfterShuffle(CommonMask, CommonMask); auto CreateSubVectors = [&](Value *Vec, SmallVectorImpl &CommonMask) { for (auto [E, Idx] : SubVectors) { @@ -14620,10 +14604,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Value *InsertVec = CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask); Vec = createShuffle(InsertVec, Vec, SVMask); - for (unsigned I : seq(CommonMask.size())) { - if (SVMask[I] != PoisonMaskElem) - CommonMask[I] = I; - } + transformMaskAfterShuffle(CommonMask, SVMask); } InVectors.front() = Vec; } From 1042ddc31b594511657ff70a82d71e2d037e2e35 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 22 Jan 2025 20:52:12 -0500 Subject: [PATCH 069/208] [gn] port ec15b242505 --- llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index 33beb0516aba2a..c81b159a45c6ac 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -60,6 +60,7 @@ write_lit_config("lit_site_cfg") { # LLVM_HOST_TRIPLE.) 
"HOST_LDFLAGS=", + "ENABLE_BACKTRACES=1", "LLVM_APPEND_VC_REV=0", "LLVM_ENABLE_FFI=0", "LLVM_ENABLE_HTTPLIB=0", From 6f69adeed6728e49c16d47bdde658285f49c8ed7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 08:57:04 +0700 Subject: [PATCH 070/208] PeepholeOpt: Remove null TargetRegisterInfo check (#123933) This cannot happen. Also simplify the LaneBitmask check from !none to any. --- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index d56f040cf421fd..5fc8f419e80a5d 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -2048,9 +2048,9 @@ ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() { // Get the TRI and check if the inserted sub-register overlaps with the // sub-register we are tracking. const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - if (!TRI || !(TRI->getSubRegIndexLaneMask(DefSubReg) & - TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) - .none()) + if ((TRI->getSubRegIndexLaneMask(DefSubReg) & + TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) + .any()) return ValueTrackerResult(); // At this point, the value is available in v0 via the same subreg // we used for Def. From 2646e2d487027e61c4e3cba5ceecfd95cedce0fe Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 09:00:08 +0700 Subject: [PATCH 071/208] PeepholeOpt: Stop allocating tiny helper classes (NFC) (#123936) This was allocating tiny helper classes for every instruction visited. We can just dispatch over the cases in the visitor function instead. 
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 1359 ++++++++++++------------ 1 file changed, 675 insertions(+), 684 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 5fc8f419e80a5d..aec4aaa81761c7 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -149,288 +149,584 @@ namespace { class ValueTrackerResult; class RecurrenceInstr; -class PeepholeOptimizer : private MachineFunction::Delegate { - const TargetInstrInfo *TII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - MachineRegisterInfo *MRI = nullptr; - MachineDominatorTree *DT = nullptr; // Machine dominator tree - MachineLoopInfo *MLI = nullptr; - +/// Interface to query instructions amenable to copy rewriting. +class Rewriter { +protected: + MachineInstr &CopyLike; + unsigned CurrentSrcIdx = 0; ///< The index of the source being rewritten. public: - PeepholeOptimizer(MachineDominatorTree *DT, MachineLoopInfo *MLI) - : DT(DT), MLI(MLI) {} - - bool run(MachineFunction &MF); - /// Track Def -> Use info used for rewriting copies. - using RewriteMapTy = SmallDenseMap; - - /// Sequence of instructions that formulate recurrence cycle. 
- using RecurrenceCycle = SmallVector; - -private: - bool optimizeCmpInstr(MachineInstr &MI); - bool optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB, - SmallPtrSetImpl &LocalMIs); - bool optimizeSelect(MachineInstr &MI, - SmallPtrSetImpl &LocalMIs); - bool optimizeCondBranch(MachineInstr &MI); - bool optimizeCoalescableCopy(MachineInstr &MI); - bool optimizeUncoalescableCopy(MachineInstr &MI, - SmallPtrSetImpl &LocalMIs); - bool optimizeRecurrence(MachineInstr &PHI); - bool findNextSource(RegSubRegPair RegSubReg, RewriteMapTy &RewriteMap); - bool isMoveImmediate(MachineInstr &MI, SmallSet &ImmDefRegs, - DenseMap &ImmDefMIs); - bool foldImmediate(MachineInstr &MI, SmallSet &ImmDefRegs, - DenseMap &ImmDefMIs, - bool &Deleted); - - /// Finds recurrence cycles, but only ones that formulated around - /// a def operand and a use operand that are tied. If there is a use - /// operand commutable with the tied use operand, find recurrence cycle - /// along that operand as well. - bool findTargetRecurrence(Register Reg, - const SmallSet &TargetReg, - RecurrenceCycle &RC); - - /// If copy instruction \p MI is a virtual register copy or a copy of a - /// constant physical register to a virtual register, track it in the - /// set CopySrcMIs. If this virtual register was previously seen as a - /// copy, replace the uses of this copy with the previously seen copy's - /// destination register. - bool foldRedundantCopy(MachineInstr &MI); - - /// Is the register \p Reg a non-allocatable physical register? - bool isNAPhysCopy(Register Reg); + Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {} + virtual ~Rewriter() = default; - /// If copy instruction \p MI is a non-allocatable virtual<->physical - /// register copy, track it in the \p NAPhysToVirtMIs map. If this - /// non-allocatable physical register was previously copied to a virtual - /// registered and hasn't been clobbered, the virt->phys copy can be - /// deleted. 
- bool - foldRedundantNAPhysCopy(MachineInstr &MI, - DenseMap &NAPhysToVirtMIs); + /// Get the next rewritable source (SrcReg, SrcSubReg) and + /// the related value that it affects (DstReg, DstSubReg). + /// A source is considered rewritable if its register class and the + /// register class of the related DstReg may not be register + /// coalescer friendly. In other words, given a copy-like instruction + /// not all the arguments may be returned at rewritable source, since + /// some arguments are none to be register coalescer friendly. + /// + /// Each call of this method moves the current source to the next + /// rewritable source. + /// For instance, let CopyLike be the instruction to rewrite. + /// CopyLike has one definition and one source: + /// dst.dstSubIdx = CopyLike src.srcSubIdx. + /// + /// The first call will give the first rewritable source, i.e., + /// the only source this instruction has: + /// (SrcReg, SrcSubReg) = (src, srcSubIdx). + /// This source defines the whole definition, i.e., + /// (DstReg, DstSubReg) = (dst, dstSubIdx). + /// + /// The second and subsequent calls will return false, as there is only one + /// rewritable source. + /// + /// \return True if a rewritable source has been found, false otherwise. + /// The output arguments are valid if and only if true is returned. + virtual bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) = 0; - bool isLoadFoldable(MachineInstr &MI, - SmallSet &FoldAsLoadDefCandidates); + /// Rewrite the current source with \p NewReg and \p NewSubReg if possible. + /// \return True if the rewriting was possible, false otherwise. + virtual bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) = 0; +}; - /// Check whether \p MI is understood by the register coalescer - /// but may require some rewriting. - bool isCoalescableCopy(const MachineInstr &MI) { - // SubregToRegs are not interesting, because they are already register - // coalescer friendly. 
- return MI.isCopy() || - (!DisableAdvCopyOpt && (MI.isRegSequence() || MI.isInsertSubreg() || - MI.isExtractSubreg())); +/// Rewriter for COPY instructions. +class CopyRewriter : public Rewriter { +public: + CopyRewriter(MachineInstr &MI) : Rewriter(MI) { + assert(MI.isCopy() && "Expected copy instruction"); } + virtual ~CopyRewriter() = default; - /// Check whether \p MI is a copy like instruction that is - /// not recognized by the register coalescer. - bool isUncoalescableCopy(const MachineInstr &MI) { - return MI.isBitcast() || (!DisableAdvCopyOpt && (MI.isRegSequenceLike() || - MI.isInsertSubregLike() || - MI.isExtractSubregLike())); + bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) override { + // CurrentSrcIdx > 0 means this function has already been called. + if (CurrentSrcIdx > 0) + return false; + // This is the first call to getNextRewritableSource. + // Move the CurrentSrcIdx to remember that we made that call. + CurrentSrcIdx = 1; + // The rewritable source is the argument. + const MachineOperand &MOSrc = CopyLike.getOperand(1); + Src = RegSubRegPair(MOSrc.getReg(), MOSrc.getSubReg()); + // What we track are the alternative sources of the definition. + const MachineOperand &MODef = CopyLike.getOperand(0); + Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); + return true; } - MachineInstr &rewriteSource(MachineInstr &CopyLike, RegSubRegPair Def, - RewriteMapTy &RewriteMap); - - // Set of copies to virtual registers keyed by source register. Never - // holds any physreg which requires def tracking. - DenseMap CopySrcMIs; + bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { + if (CurrentSrcIdx != 1) + return false; + MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx); + MOSrc.setReg(NewReg); + MOSrc.setSubReg(NewSubReg); + return true; + } +}; - // MachineFunction::Delegate implementation. Used to maintain CopySrcMIs. 
- void MF_HandleInsertion(MachineInstr &MI) override { return; } +/// Helper class to rewrite uncoalescable copy like instructions +/// into new COPY (coalescable friendly) instructions. +class UncoalescableRewriter : public Rewriter { + unsigned NumDefs; ///< Number of defs in the bitcast. - bool getCopySrc(MachineInstr &MI, RegSubRegPair &SrcPair) { - if (!MI.isCopy()) - return false; +public: + UncoalescableRewriter(MachineInstr &MI) : Rewriter(MI) { + NumDefs = MI.getDesc().getNumDefs(); + } - Register SrcReg = MI.getOperand(1).getReg(); - unsigned SrcSubReg = MI.getOperand(1).getSubReg(); - if (!SrcReg.isVirtual() && !MRI->isConstantPhysReg(SrcReg)) + /// \see See Rewriter::getNextRewritableSource() + /// All such sources need to be considered rewritable in order to + /// rewrite a uncoalescable copy-like instruction. This method return + /// each definition that must be checked if rewritable. + bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) override { + // Find the next non-dead definition and continue from there. + if (CurrentSrcIdx == NumDefs) return false; - SrcPair = RegSubRegPair(SrcReg, SrcSubReg); - return true; - } + while (CopyLike.getOperand(CurrentSrcIdx).isDead()) { + ++CurrentSrcIdx; + if (CurrentSrcIdx == NumDefs) + return false; + } - // If a COPY instruction is to be deleted or changed, we should also remove - // it from CopySrcMIs. - void deleteChangedCopy(MachineInstr &MI) { - RegSubRegPair SrcPair; - if (!getCopySrc(MI, SrcPair)) - return; + // What we track are the alternative sources of the definition. 
+ Src = RegSubRegPair(0, 0); + const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx); + Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); - auto It = CopySrcMIs.find(SrcPair); - if (It != CopySrcMIs.end() && It->second == &MI) - CopySrcMIs.erase(It); + CurrentSrcIdx++; + return true; } - void MF_HandleRemoval(MachineInstr &MI) override { deleteChangedCopy(MI); } - - void MF_HandleChangeDesc(MachineInstr &MI, const MCInstrDesc &TID) override { - deleteChangedCopy(MI); + bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { + return false; } }; -class PeepholeOptimizerLegacy : public MachineFunctionPass { +/// Specialized rewriter for INSERT_SUBREG instruction. +class InsertSubregRewriter : public Rewriter { public: - static char ID; // Pass identification - - PeepholeOptimizerLegacy() : MachineFunctionPass(ID) { - initializePeepholeOptimizerLegacyPass(*PassRegistry::getPassRegistry()); + InsertSubregRewriter(MachineInstr &MI) : Rewriter(MI) { + assert(MI.isInsertSubreg() && "Invalid instruction"); } - bool runOnMachineFunction(MachineFunction &MF) override; + /// \see See Rewriter::getNextRewritableSource() + /// Here CopyLike has the following form: + /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx. + /// Src1 has the same register class has dst, hence, there is + /// nothing to rewrite. + /// Src2.src2SubIdx, may not be register coalescer friendly. + /// Therefore, the first call to this method returns: + /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx). + /// (DstReg, DstSubReg) = (dst, subIdx). + /// + /// Subsequence calls will return false. + bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) override { + // If we already get the only source we can rewrite, return false. + if (CurrentSrcIdx == 2) + return false; + // We are looking at v2 = INSERT_SUBREG v0, v1, sub0. 
+ CurrentSrcIdx = 2; + const MachineOperand &MOInsertedReg = CopyLike.getOperand(2); + Src = RegSubRegPair(MOInsertedReg.getReg(), MOInsertedReg.getSubReg()); + const MachineOperand &MODef = CopyLike.getOperand(0); - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); - if (Aggressive) { - AU.addRequired(); - AU.addPreserved(); - } + // We want to track something that is compatible with the + // partial definition. + if (MODef.getSubReg()) + // Bail if we have to compose sub-register indices. + return false; + Dst = RegSubRegPair(MODef.getReg(), + (unsigned)CopyLike.getOperand(3).getImm()); + return true; } - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::IsSSA); - } -}; - -/// Helper class to hold instructions that are inside recurrence cycles. -/// The recurrence cycle is formulated around 1) a def operand and its -/// tied use operand, or 2) a def operand and a use operand that is commutable -/// with another use operand which is tied to the def operand. In the latter -/// case, index of the tied use operand and the commutable use operand are -/// maintained with CommutePair. -class RecurrenceInstr { -public: - using IndexPair = std::pair; - - RecurrenceInstr(MachineInstr *MI) : MI(MI) {} - RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2) - : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {} - - MachineInstr *getMI() const { return MI; } - std::optional getCommutePair() const { return CommutePair; } - -private: - MachineInstr *MI; - std::optional CommutePair; + bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { + if (CurrentSrcIdx != 2) + return false; + // We are rewriting the inserted reg. 
+ MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); + MO.setReg(NewReg); + MO.setSubReg(NewSubReg); + return true; + } }; -/// Helper class to hold a reply for ValueTracker queries. -/// Contains the returned sources for a given search and the instructions -/// where the sources were tracked from. -class ValueTrackerResult { -private: - /// Track all sources found by one ValueTracker query. - SmallVector RegSrcs; - - /// Instruction using the sources in 'RegSrcs'. - const MachineInstr *Inst = nullptr; +/// Specialized rewriter for EXTRACT_SUBREG instruction. +class ExtractSubregRewriter : public Rewriter { + const TargetInstrInfo &TII; public: - ValueTrackerResult() = default; - - ValueTrackerResult(Register Reg, unsigned SubReg) { addSource(Reg, SubReg); } + ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII) + : Rewriter(MI), TII(TII) { + assert(MI.isExtractSubreg() && "Invalid instruction"); + } - bool isValid() const { return getNumSources() > 0; } + /// \see Rewriter::getNextRewritableSource() + /// Here CopyLike has the following form: + /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx. + /// There is only one rewritable source: Src.subIdx, + /// which defines dst.dstSubIdx. + bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) override { + // If we already get the only source we can rewrite, return false. + if (CurrentSrcIdx == 1) + return false; + // We are looking at v1 = EXTRACT_SUBREG v0, sub0. + CurrentSrcIdx = 1; + const MachineOperand &MOExtractedReg = CopyLike.getOperand(1); + // If we have to compose sub-register indices, bail out. + if (MOExtractedReg.getSubReg()) + return false; - void setInst(const MachineInstr *I) { Inst = I; } - const MachineInstr *getInst() const { return Inst; } + Src = + RegSubRegPair(MOExtractedReg.getReg(), CopyLike.getOperand(2).getImm()); - void clear() { - RegSrcs.clear(); - Inst = nullptr; + // We want to track something that is compatible with the definition. 
+ const MachineOperand &MODef = CopyLike.getOperand(0); + Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); + return true; } - void addSource(Register SrcReg, unsigned SrcSubReg) { - RegSrcs.push_back(RegSubRegPair(SrcReg, SrcSubReg)); + bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { + // The only source we can rewrite is the input register. + if (CurrentSrcIdx != 1) + return false; + + CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg); + + // If we find a source that does not require to extract something, + // rewrite the operation with a copy. + if (!NewSubReg) { + // Move the current index to an invalid position. + // We do not want another call to this method to be able + // to do any change. + CurrentSrcIdx = -1; + // Rewrite the operation as a COPY. + // Get rid of the sub-register index. + CopyLike.removeOperand(2); + // Morph the operation into a COPY. + CopyLike.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg); + return true; } +}; - void setSource(int Idx, Register SrcReg, unsigned SrcSubReg) { - assert(Idx < getNumSources() && "Reg pair source out of index"); - RegSrcs[Idx] = RegSubRegPair(SrcReg, SrcSubReg); +/// Specialized rewriter for REG_SEQUENCE instruction. +class RegSequenceRewriter : public Rewriter { +public: + RegSequenceRewriter(MachineInstr &MI) : Rewriter(MI) { + assert(MI.isRegSequence() && "Invalid instruction"); } - int getNumSources() const { return RegSrcs.size(); } + /// \see Rewriter::getNextRewritableSource() + /// Here CopyLike has the following form: + /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2. + /// Each call will return a different source, walking all the available + /// source. + /// + /// The first call returns: + /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx). + /// (DstReg, DstSubReg) = (dst, subIdx1). + /// + /// The second call returns: + /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx). 
+ /// (DstReg, DstSubReg) = (dst, subIdx2). + /// + /// And so on, until all the sources have been traversed, then + /// it returns false. + bool getNextRewritableSource(RegSubRegPair &Src, + RegSubRegPair &Dst) override { + // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc. - RegSubRegPair getSrc(int Idx) const { return RegSrcs[Idx]; } + // If this is the first call, move to the first argument. + if (CurrentSrcIdx == 0) { + CurrentSrcIdx = 1; + } else { + // Otherwise, move to the next argument and check that it is valid. + CurrentSrcIdx += 2; + if (CurrentSrcIdx >= CopyLike.getNumOperands()) + return false; + } + const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); + Src.Reg = MOInsertedReg.getReg(); + // If we have to compose sub-register indices, bail out. + if ((Src.SubReg = MOInsertedReg.getSubReg())) + return false; - Register getSrcReg(int Idx) const { - assert(Idx < getNumSources() && "Reg source out of index"); - return RegSrcs[Idx].Reg; - } + // We want to track something that is compatible with the related + // partial definition. + Dst.SubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm(); - unsigned getSrcSubReg(int Idx) const { - assert(Idx < getNumSources() && "SubReg source out of index"); - return RegSrcs[Idx].SubReg; + const MachineOperand &MODef = CopyLike.getOperand(0); + Dst.Reg = MODef.getReg(); + // If we have to compose sub-registers, bail. + return MODef.getSubReg() == 0; } - bool operator==(const ValueTrackerResult &Other) const { - if (Other.getInst() != getInst()) - return false; - - if (Other.getNumSources() != getNumSources()) + bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { + // We cannot rewrite out of bound operands. + // Moreover, rewritable sources are at odd positions. 
+ if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands()) return false; - for (int i = 0, e = Other.getNumSources(); i != e; ++i) - if (Other.getSrcReg(i) != getSrcReg(i) || - Other.getSrcSubReg(i) != getSrcSubReg(i)) - return false; + MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); + MO.setReg(NewReg); + MO.setSubReg(NewSubReg); return true; } }; -/// Helper class to track the possible sources of a value defined by -/// a (chain of) copy related instructions. -/// Given a definition (instruction and definition index), this class -/// follows the use-def chain to find successive suitable sources. -/// The given source can be used to rewrite the definition into -/// def = COPY src. -/// -/// For instance, let us consider the following snippet: -/// v0 = -/// v2 = INSERT_SUBREG v1, v0, sub0 -/// def = COPY v2.sub0 -/// -/// Using a ValueTracker for def = COPY v2.sub0 will give the following -/// suitable sources: -/// v2.sub0 and v0. -/// Then, def can be rewritten into def = COPY v0. -class ValueTracker { -private: - /// The current point into the use-def chain. - const MachineInstr *Def = nullptr; +class PeepholeOptimizer : private MachineFunction::Delegate { + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineDominatorTree *DT = nullptr; // Machine dominator tree + MachineLoopInfo *MLI = nullptr; - /// The index of the definition in Def. - unsigned DefIdx = 0; +public: + PeepholeOptimizer(MachineDominatorTree *DT, MachineLoopInfo *MLI) + : DT(DT), MLI(MLI) {} - /// The sub register index of the definition. - unsigned DefSubReg; + bool run(MachineFunction &MF); + /// Track Def -> Use info used for rewriting copies. + using RewriteMapTy = SmallDenseMap; - /// The register where the value can be found. - Register Reg; + /// Sequence of instructions that formulate recurrence cycle. 
+ using RecurrenceCycle = SmallVector; - /// MachineRegisterInfo used to perform tracking. - const MachineRegisterInfo &MRI; +private: + bool optimizeCmpInstr(MachineInstr &MI); + bool optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB, + SmallPtrSetImpl &LocalMIs); + bool optimizeSelect(MachineInstr &MI, + SmallPtrSetImpl &LocalMIs); + bool optimizeCondBranch(MachineInstr &MI); - /// Optional TargetInstrInfo used to perform some complex tracking. - const TargetInstrInfo *TII; + bool optimizeCoalescableCopyImpl(Rewriter &&CpyRewriter); + bool optimizeCoalescableCopy(MachineInstr &MI); + bool optimizeUncoalescableCopy(MachineInstr &MI, + SmallPtrSetImpl &LocalMIs); + bool optimizeRecurrence(MachineInstr &PHI); + bool findNextSource(RegSubRegPair RegSubReg, RewriteMapTy &RewriteMap); + bool isMoveImmediate(MachineInstr &MI, SmallSet &ImmDefRegs, + DenseMap &ImmDefMIs); + bool foldImmediate(MachineInstr &MI, SmallSet &ImmDefRegs, + DenseMap &ImmDefMIs, + bool &Deleted); - /// Dispatcher to the right underlying implementation of getNextSource. - ValueTrackerResult getNextSourceImpl(); + /// Finds recurrence cycles, but only ones that formulated around + /// a def operand and a use operand that are tied. If there is a use + /// operand commutable with the tied use operand, find recurrence cycle + /// along that operand as well. + bool findTargetRecurrence(Register Reg, + const SmallSet &TargetReg, + RecurrenceCycle &RC); - /// Specialized version of getNextSource for Copy instructions. - ValueTrackerResult getNextSourceFromCopy(); + /// If copy instruction \p MI is a virtual register copy or a copy of a + /// constant physical register to a virtual register, track it in the + /// set CopySrcMIs. If this virtual register was previously seen as a + /// copy, replace the uses of this copy with the previously seen copy's + /// destination register. + bool foldRedundantCopy(MachineInstr &MI); - /// Specialized version of getNextSource for Bitcast instructions. 
+ /// Is the register \p Reg a non-allocatable physical register? + bool isNAPhysCopy(Register Reg); + + /// If copy instruction \p MI is a non-allocatable virtual<->physical + /// register copy, track it in the \p NAPhysToVirtMIs map. If this + /// non-allocatable physical register was previously copied to a virtual + /// registered and hasn't been clobbered, the virt->phys copy can be + /// deleted. + bool + foldRedundantNAPhysCopy(MachineInstr &MI, + DenseMap &NAPhysToVirtMIs); + + bool isLoadFoldable(MachineInstr &MI, + SmallSet &FoldAsLoadDefCandidates); + + /// Check whether \p MI is understood by the register coalescer + /// but may require some rewriting. + bool isCoalescableCopy(const MachineInstr &MI) { + // SubregToRegs are not interesting, because they are already register + // coalescer friendly. + return MI.isCopy() || + (!DisableAdvCopyOpt && (MI.isRegSequence() || MI.isInsertSubreg() || + MI.isExtractSubreg())); + } + + /// Check whether \p MI is a copy like instruction that is + /// not recognized by the register coalescer. + bool isUncoalescableCopy(const MachineInstr &MI) { + return MI.isBitcast() || (!DisableAdvCopyOpt && (MI.isRegSequenceLike() || + MI.isInsertSubregLike() || + MI.isExtractSubregLike())); + } + + MachineInstr &rewriteSource(MachineInstr &CopyLike, RegSubRegPair Def, + RewriteMapTy &RewriteMap); + + // Set of copies to virtual registers keyed by source register. Never + // holds any physreg which requires def tracking. + DenseMap CopySrcMIs; + + // MachineFunction::Delegate implementation. Used to maintain CopySrcMIs. 
+ void MF_HandleInsertion(MachineInstr &MI) override { return; } + + bool getCopySrc(MachineInstr &MI, RegSubRegPair &SrcPair) { + if (!MI.isCopy()) + return false; + + Register SrcReg = MI.getOperand(1).getReg(); + unsigned SrcSubReg = MI.getOperand(1).getSubReg(); + if (!SrcReg.isVirtual() && !MRI->isConstantPhysReg(SrcReg)) + return false; + + SrcPair = RegSubRegPair(SrcReg, SrcSubReg); + return true; + } + + // If a COPY instruction is to be deleted or changed, we should also remove + // it from CopySrcMIs. + void deleteChangedCopy(MachineInstr &MI) { + RegSubRegPair SrcPair; + if (!getCopySrc(MI, SrcPair)) + return; + + auto It = CopySrcMIs.find(SrcPair); + if (It != CopySrcMIs.end() && It->second == &MI) + CopySrcMIs.erase(It); + } + + void MF_HandleRemoval(MachineInstr &MI) override { deleteChangedCopy(MI); } + + void MF_HandleChangeDesc(MachineInstr &MI, const MCInstrDesc &TID) override { + deleteChangedCopy(MI); + } +}; + +class PeepholeOptimizerLegacy : public MachineFunctionPass { +public: + static char ID; // Pass identification + + PeepholeOptimizerLegacy() : MachineFunctionPass(ID) { + initializePeepholeOptimizerLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + if (Aggressive) { + AU.addRequired(); + AU.addPreserved(); + } + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +/// Helper class to hold instructions that are inside recurrence cycles. +/// The recurrence cycle is formulated around 1) a def operand and its +/// tied use operand, or 2) a def operand and a use operand that is commutable +/// with another use operand which is tied to the def operand. 
In the latter +/// case, index of the tied use operand and the commutable use operand are +/// maintained with CommutePair. +class RecurrenceInstr { +public: + using IndexPair = std::pair; + + RecurrenceInstr(MachineInstr *MI) : MI(MI) {} + RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2) + : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {} + + MachineInstr *getMI() const { return MI; } + std::optional getCommutePair() const { return CommutePair; } + +private: + MachineInstr *MI; + std::optional CommutePair; +}; + +/// Helper class to hold a reply for ValueTracker queries. +/// Contains the returned sources for a given search and the instructions +/// where the sources were tracked from. +class ValueTrackerResult { +private: + /// Track all sources found by one ValueTracker query. + SmallVector RegSrcs; + + /// Instruction using the sources in 'RegSrcs'. + const MachineInstr *Inst = nullptr; + +public: + ValueTrackerResult() = default; + + ValueTrackerResult(Register Reg, unsigned SubReg) { addSource(Reg, SubReg); } + + bool isValid() const { return getNumSources() > 0; } + + void setInst(const MachineInstr *I) { Inst = I; } + const MachineInstr *getInst() const { return Inst; } + + void clear() { + RegSrcs.clear(); + Inst = nullptr; + } + + void addSource(Register SrcReg, unsigned SrcSubReg) { + RegSrcs.push_back(RegSubRegPair(SrcReg, SrcSubReg)); + } + + void setSource(int Idx, Register SrcReg, unsigned SrcSubReg) { + assert(Idx < getNumSources() && "Reg pair source out of index"); + RegSrcs[Idx] = RegSubRegPair(SrcReg, SrcSubReg); + } + + int getNumSources() const { return RegSrcs.size(); } + + RegSubRegPair getSrc(int Idx) const { return RegSrcs[Idx]; } + + Register getSrcReg(int Idx) const { + assert(Idx < getNumSources() && "Reg source out of index"); + return RegSrcs[Idx].Reg; + } + + unsigned getSrcSubReg(int Idx) const { + assert(Idx < getNumSources() && "SubReg source out of index"); + return RegSrcs[Idx].SubReg; + } + + bool 
operator==(const ValueTrackerResult &Other) const { + if (Other.getInst() != getInst()) + return false; + + if (Other.getNumSources() != getNumSources()) + return false; + + for (int i = 0, e = Other.getNumSources(); i != e; ++i) + if (Other.getSrcReg(i) != getSrcReg(i) || + Other.getSrcSubReg(i) != getSrcSubReg(i)) + return false; + return true; + } +}; + +/// Helper class to track the possible sources of a value defined by +/// a (chain of) copy related instructions. +/// Given a definition (instruction and definition index), this class +/// follows the use-def chain to find successive suitable sources. +/// The given source can be used to rewrite the definition into +/// def = COPY src. +/// +/// For instance, let us consider the following snippet: +/// v0 = +/// v2 = INSERT_SUBREG v1, v0, sub0 +/// def = COPY v2.sub0 +/// +/// Using a ValueTracker for def = COPY v2.sub0 will give the following +/// suitable sources: +/// v2.sub0 and v0. +/// Then, def can be rewritten into def = COPY v0. +class ValueTracker { +private: + /// The current point into the use-def chain. + const MachineInstr *Def = nullptr; + + /// The index of the definition in Def. + unsigned DefIdx = 0; + + /// The sub register index of the definition. + unsigned DefSubReg; + + /// The register where the value can be found. + Register Reg; + + /// MachineRegisterInfo used to perform tracking. + const MachineRegisterInfo &MRI; + + /// Optional TargetInstrInfo used to perform some complex tracking. + const TargetInstrInfo *TII; + + /// Dispatcher to the right underlying implementation of getNextSource. + ValueTrackerResult getNextSourceImpl(); + + /// Specialized version of getNextSource for Copy instructions. + ValueTrackerResult getNextSourceFromCopy(); + + /// Specialized version of getNextSource for Bitcast instructions. ValueTrackerResult getNextSourceFromBitcast(); /// Specialized version of getNextSource for RegSequence instructions. 
@@ -707,457 +1003,136 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr &MI) { /// Try to find the next source that share the same register file /// for the value defined by \p Reg and \p SubReg. /// When true is returned, the \p RewriteMap can be used by the client to -/// retrieve all Def -> Use along the way up to the next source. Any found -/// Use that is not itself a key for another entry, is the next source to -/// use. During the search for the next source, multiple sources can be found -/// given multiple incoming sources of a PHI instruction. In this case, we -/// look in each PHI source for the next source; all found next sources must -/// share the same register file as \p Reg and \p SubReg. The client should -/// then be capable to rewrite all intermediate PHIs to get the next source. -/// \return False if no alternative sources are available. True otherwise. -bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, - RewriteMapTy &RewriteMap) { - // Do not try to find a new source for a physical register. - // So far we do not have any motivating example for doing that. - // Thus, instead of maintaining untested code, we will revisit that if - // that changes at some point. - Register Reg = RegSubReg.Reg; - if (Reg.isPhysical()) - return false; - const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); - - SmallVector SrcToLook; - RegSubRegPair CurSrcPair = RegSubReg; - SrcToLook.push_back(CurSrcPair); - - unsigned PHICount = 0; - do { - CurSrcPair = SrcToLook.pop_back_val(); - // As explained above, do not handle physical registers - if (CurSrcPair.Reg.isPhysical()) - return false; - - ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, TII); - - // Follow the chain of copies until we find a more suitable source, a phi - // or have to abort. - while (true) { - ValueTrackerResult Res = ValTracker.getNextSource(); - // Abort at the end of a chain (without finding a suitable source). 
- if (!Res.isValid()) - return false; - - // Insert the Def -> Use entry for the recently found source. - ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair); - if (CurSrcRes.isValid()) { - assert(CurSrcRes == Res && "ValueTrackerResult found must match"); - // An existent entry with multiple sources is a PHI cycle we must avoid. - // Otherwise it's an entry with a valid next source we already found. - if (CurSrcRes.getNumSources() > 1) { - LLVM_DEBUG(dbgs() - << "findNextSource: found PHI cycle, aborting...\n"); - return false; - } - break; - } - RewriteMap.insert(std::make_pair(CurSrcPair, Res)); - - // ValueTrackerResult usually have one source unless it's the result from - // a PHI instruction. Add the found PHI edges to be looked up further. - unsigned NumSrcs = Res.getNumSources(); - if (NumSrcs > 1) { - PHICount++; - if (PHICount >= RewritePHILimit) { - LLVM_DEBUG(dbgs() << "findNextSource: PHI limit reached\n"); - return false; - } - - for (unsigned i = 0; i < NumSrcs; ++i) - SrcToLook.push_back(Res.getSrc(i)); - break; - } - - CurSrcPair = Res.getSrc(0); - // Do not extend the live-ranges of physical registers as they add - // constraints to the register allocator. Moreover, if we want to extend - // the live-range of a physical register, unlike SSA virtual register, - // we will have to check that they aren't redefine before the related use. - if (CurSrcPair.Reg.isPhysical()) - return false; - - // Keep following the chain if the value isn't any better yet. - const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg); - if (!TRI->shouldRewriteCopySrc(DefRC, RegSubReg.SubReg, SrcRC, - CurSrcPair.SubReg)) - continue; - - // We currently cannot deal with subreg operands on PHI instructions - // (see insertPHI()). - if (PHICount > 0 && CurSrcPair.SubReg != 0) - continue; - - // We found a suitable source, and are done with this chain. 
- break; - } - } while (!SrcToLook.empty()); - - // If we did not find a more suitable source, there is nothing to optimize. - return CurSrcPair.Reg != Reg; -} - -/// Insert a PHI instruction with incoming edges \p SrcRegs that are -/// guaranteed to have the same register class. This is necessary whenever we -/// successfully traverse a PHI instruction and find suitable sources coming -/// from its edges. By inserting a new PHI, we provide a rewritten PHI def -/// suitable to be used in a new COPY instruction. -static MachineInstr &insertPHI(MachineRegisterInfo &MRI, - const TargetInstrInfo &TII, - const SmallVectorImpl &SrcRegs, - MachineInstr &OrigPHI) { - assert(!SrcRegs.empty() && "No sources to create a PHI instruction?"); - - const TargetRegisterClass *NewRC = MRI.getRegClass(SrcRegs[0].Reg); - // NewRC is only correct if no subregisters are involved. findNextSource() - // should have rejected those cases already. - assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand"); - Register NewVR = MRI.createVirtualRegister(NewRC); - MachineBasicBlock *MBB = OrigPHI.getParent(); - MachineInstrBuilder MIB = BuildMI(*MBB, &OrigPHI, OrigPHI.getDebugLoc(), - TII.get(TargetOpcode::PHI), NewVR); - - unsigned MBBOpIdx = 2; - for (const RegSubRegPair &RegPair : SrcRegs) { - MIB.addReg(RegPair.Reg, 0, RegPair.SubReg); - MIB.addMBB(OrigPHI.getOperand(MBBOpIdx).getMBB()); - // Since we're extended the lifetime of RegPair.Reg, clear the - // kill flags to account for that and make RegPair.Reg reaches - // the new PHI. - MRI.clearKillFlags(RegPair.Reg); - MBBOpIdx += 2; - } - - return *MIB; -} - -namespace { - -/// Interface to query instructions amenable to copy rewriting. -class Rewriter { -protected: - MachineInstr &CopyLike; - unsigned CurrentSrcIdx = 0; ///< The index of the source being rewritten. 
-public: - Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {} - virtual ~Rewriter() = default; - - /// Get the next rewritable source (SrcReg, SrcSubReg) and - /// the related value that it affects (DstReg, DstSubReg). - /// A source is considered rewritable if its register class and the - /// register class of the related DstReg may not be register - /// coalescer friendly. In other words, given a copy-like instruction - /// not all the arguments may be returned at rewritable source, since - /// some arguments are none to be register coalescer friendly. - /// - /// Each call of this method moves the current source to the next - /// rewritable source. - /// For instance, let CopyLike be the instruction to rewrite. - /// CopyLike has one definition and one source: - /// dst.dstSubIdx = CopyLike src.srcSubIdx. - /// - /// The first call will give the first rewritable source, i.e., - /// the only source this instruction has: - /// (SrcReg, SrcSubReg) = (src, srcSubIdx). - /// This source defines the whole definition, i.e., - /// (DstReg, DstSubReg) = (dst, dstSubIdx). - /// - /// The second and subsequent calls will return false, as there is only one - /// rewritable source. - /// - /// \return True if a rewritable source has been found, false otherwise. - /// The output arguments are valid if and only if true is returned. - virtual bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) = 0; - - /// Rewrite the current source with \p NewReg and \p NewSubReg if possible. - /// \return True if the rewriting was possible, false otherwise. - virtual bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) = 0; -}; - -/// Rewriter for COPY instructions. 
-class CopyRewriter : public Rewriter { -public: - CopyRewriter(MachineInstr &MI) : Rewriter(MI) { - assert(MI.isCopy() && "Expected copy instruction"); - } - virtual ~CopyRewriter() = default; - - bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) override { - // CurrentSrcIdx > 0 means this function has already been called. - if (CurrentSrcIdx > 0) - return false; - // This is the first call to getNextRewritableSource. - // Move the CurrentSrcIdx to remember that we made that call. - CurrentSrcIdx = 1; - // The rewritable source is the argument. - const MachineOperand &MOSrc = CopyLike.getOperand(1); - Src = RegSubRegPair(MOSrc.getReg(), MOSrc.getSubReg()); - // What we track are the alternative sources of the definition. - const MachineOperand &MODef = CopyLike.getOperand(0); - Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); - return true; - } - - bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - if (CurrentSrcIdx != 1) - return false; - MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx); - MOSrc.setReg(NewReg); - MOSrc.setSubReg(NewSubReg); - return true; - } -}; - -/// Helper class to rewrite uncoalescable copy like instructions -/// into new COPY (coalescable friendly) instructions. -class UncoalescableRewriter : public Rewriter { - unsigned NumDefs; ///< Number of defs in the bitcast. - -public: - UncoalescableRewriter(MachineInstr &MI) : Rewriter(MI) { - NumDefs = MI.getDesc().getNumDefs(); - } - - /// \see See Rewriter::getNextRewritableSource() - /// All such sources need to be considered rewritable in order to - /// rewrite a uncoalescable copy-like instruction. This method return - /// each definition that must be checked if rewritable. - bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) override { - // Find the next non-dead definition and continue from there. 
- if (CurrentSrcIdx == NumDefs) - return false; - - while (CopyLike.getOperand(CurrentSrcIdx).isDead()) { - ++CurrentSrcIdx; - if (CurrentSrcIdx == NumDefs) - return false; - } - - // What we track are the alternative sources of the definition. - Src = RegSubRegPair(0, 0); - const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx); - Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); - - CurrentSrcIdx++; - return true; - } - - bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - return false; - } -}; - -/// Specialized rewriter for INSERT_SUBREG instruction. -class InsertSubregRewriter : public Rewriter { -public: - InsertSubregRewriter(MachineInstr &MI) : Rewriter(MI) { - assert(MI.isInsertSubreg() && "Invalid instruction"); - } - - /// \see See Rewriter::getNextRewritableSource() - /// Here CopyLike has the following form: - /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx. - /// Src1 has the same register class has dst, hence, there is - /// nothing to rewrite. - /// Src2.src2SubIdx, may not be register coalescer friendly. - /// Therefore, the first call to this method returns: - /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx). - /// (DstReg, DstSubReg) = (dst, subIdx). - /// - /// Subsequence calls will return false. - bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) override { - // If we already get the only source we can rewrite, return false. - if (CurrentSrcIdx == 2) - return false; - // We are looking at v2 = INSERT_SUBREG v0, v1, sub0. - CurrentSrcIdx = 2; - const MachineOperand &MOInsertedReg = CopyLike.getOperand(2); - Src = RegSubRegPair(MOInsertedReg.getReg(), MOInsertedReg.getSubReg()); - const MachineOperand &MODef = CopyLike.getOperand(0); - - // We want to track something that is compatible with the - // partial definition. - if (MODef.getSubReg()) - // Bail if we have to compose sub-register indices. 
- return false; - Dst = RegSubRegPair(MODef.getReg(), - (unsigned)CopyLike.getOperand(3).getImm()); - return true; - } - - bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - if (CurrentSrcIdx != 2) - return false; - // We are rewriting the inserted reg. - MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); - MO.setReg(NewReg); - MO.setSubReg(NewSubReg); - return true; - } -}; - -/// Specialized rewriter for EXTRACT_SUBREG instruction. -class ExtractSubregRewriter : public Rewriter { - const TargetInstrInfo &TII; - -public: - ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII) - : Rewriter(MI), TII(TII) { - assert(MI.isExtractSubreg() && "Invalid instruction"); - } - - /// \see Rewriter::getNextRewritableSource() - /// Here CopyLike has the following form: - /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx. - /// There is only one rewritable source: Src.subIdx, - /// which defines dst.dstSubIdx. - bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) override { - // If we already get the only source we can rewrite, return false. - if (CurrentSrcIdx == 1) - return false; - // We are looking at v1 = EXTRACT_SUBREG v0, sub0. - CurrentSrcIdx = 1; - const MachineOperand &MOExtractedReg = CopyLike.getOperand(1); - // If we have to compose sub-register indices, bail out. - if (MOExtractedReg.getSubReg()) - return false; - - Src = - RegSubRegPair(MOExtractedReg.getReg(), CopyLike.getOperand(2).getImm()); +/// retrieve all Def -> Use along the way up to the next source. Any found +/// Use that is not itself a key for another entry, is the next source to +/// use. During the search for the next source, multiple sources can be found +/// given multiple incoming sources of a PHI instruction. In this case, we +/// look in each PHI source for the next source; all found next sources must +/// share the same register file as \p Reg and \p SubReg. 
The client should +/// then be capable to rewrite all intermediate PHIs to get the next source. +/// \return False if no alternative sources are available. True otherwise. +bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, + RewriteMapTy &RewriteMap) { + // Do not try to find a new source for a physical register. + // So far we do not have any motivating example for doing that. + // Thus, instead of maintaining untested code, we will revisit that if + // that changes at some point. + Register Reg = RegSubReg.Reg; + if (Reg.isPhysical()) + return false; + const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); - // We want to track something that is compatible with the definition. - const MachineOperand &MODef = CopyLike.getOperand(0); - Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg()); - return true; - } + SmallVector SrcToLook; + RegSubRegPair CurSrcPair = RegSubReg; + SrcToLook.push_back(CurSrcPair); - bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - // The only source we can rewrite is the input register. - if (CurrentSrcIdx != 1) + unsigned PHICount = 0; + do { + CurSrcPair = SrcToLook.pop_back_val(); + // As explained above, do not handle physical registers + if (CurSrcPair.Reg.isPhysical()) return false; - CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg); + ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, TII); - // If we find a source that does not require to extract something, - // rewrite the operation with a copy. - if (!NewSubReg) { - // Move the current index to an invalid position. - // We do not want another call to this method to be able - // to do any change. - CurrentSrcIdx = -1; - // Rewrite the operation as a COPY. - // Get rid of the sub-register index. - CopyLike.removeOperand(2); - // Morph the operation into a COPY. 
- CopyLike.setDesc(TII.get(TargetOpcode::COPY)); - return true; - } - CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg); - return true; - } -}; + // Follow the chain of copies until we find a more suitable source, a phi + // or have to abort. + while (true) { + ValueTrackerResult Res = ValTracker.getNextSource(); + // Abort at the end of a chain (without finding a suitable source). + if (!Res.isValid()) + return false; -/// Specialized rewriter for REG_SEQUENCE instruction. -class RegSequenceRewriter : public Rewriter { -public: - RegSequenceRewriter(MachineInstr &MI) : Rewriter(MI) { - assert(MI.isRegSequence() && "Invalid instruction"); - } + // Insert the Def -> Use entry for the recently found source. + ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair); + if (CurSrcRes.isValid()) { + assert(CurSrcRes == Res && "ValueTrackerResult found must match"); + // An existent entry with multiple sources is a PHI cycle we must avoid. + // Otherwise it's an entry with a valid next source we already found. + if (CurSrcRes.getNumSources() > 1) { + LLVM_DEBUG(dbgs() + << "findNextSource: found PHI cycle, aborting...\n"); + return false; + } + break; + } + RewriteMap.insert(std::make_pair(CurSrcPair, Res)); - /// \see Rewriter::getNextRewritableSource() - /// Here CopyLike has the following form: - /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2. - /// Each call will return a different source, walking all the available - /// source. - /// - /// The first call returns: - /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx). - /// (DstReg, DstSubReg) = (dst, subIdx1). - /// - /// The second call returns: - /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx). - /// (DstReg, DstSubReg) = (dst, subIdx2). - /// - /// And so on, until all the sources have been traversed, then - /// it returns false. - bool getNextRewritableSource(RegSubRegPair &Src, - RegSubRegPair &Dst) override { - // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc. 
+ // ValueTrackerResult usually have one source unless it's the result from + // a PHI instruction. Add the found PHI edges to be looked up further. + unsigned NumSrcs = Res.getNumSources(); + if (NumSrcs > 1) { + PHICount++; + if (PHICount >= RewritePHILimit) { + LLVM_DEBUG(dbgs() << "findNextSource: PHI limit reached\n"); + return false; + } - // If this is the first call, move to the first argument. - if (CurrentSrcIdx == 0) { - CurrentSrcIdx = 1; - } else { - // Otherwise, move to the next argument and check that it is valid. - CurrentSrcIdx += 2; - if (CurrentSrcIdx >= CopyLike.getNumOperands()) + for (unsigned i = 0; i < NumSrcs; ++i) + SrcToLook.push_back(Res.getSrc(i)); + break; + } + + CurSrcPair = Res.getSrc(0); + // Do not extend the live-ranges of physical registers as they add + // constraints to the register allocator. Moreover, if we want to extend + // the live-range of a physical register, unlike SSA virtual register, + // we will have to check that they aren't redefine before the related use. + if (CurSrcPair.Reg.isPhysical()) return false; - } - const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); - Src.Reg = MOInsertedReg.getReg(); - // If we have to compose sub-register indices, bail out. - if ((Src.SubReg = MOInsertedReg.getSubReg())) - return false; - // We want to track something that is compatible with the related - // partial definition. - Dst.SubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm(); + // Keep following the chain if the value isn't any better yet. + const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg); + if (!TRI->shouldRewriteCopySrc(DefRC, RegSubReg.SubReg, SrcRC, + CurSrcPair.SubReg)) + continue; - const MachineOperand &MODef = CopyLike.getOperand(0); - Dst.Reg = MODef.getReg(); - // If we have to compose sub-registers, bail. - return MODef.getSubReg() == 0; - } + // We currently cannot deal with subreg operands on PHI instructions + // (see insertPHI()). 
+ if (PHICount > 0 && CurSrcPair.SubReg != 0) + continue; - bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - // We cannot rewrite out of bound operands. - // Moreover, rewritable sources are at odd positions. - if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands()) - return false; + // We found a suitable source, and are done with this chain. + break; + } + } while (!SrcToLook.empty()); - MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); - MO.setReg(NewReg); - MO.setSubReg(NewSubReg); - return true; - } -}; + // If we did not find a more suitable source, there is nothing to optimize. + return CurSrcPair.Reg != Reg; +} -} // end anonymous namespace +/// Insert a PHI instruction with incoming edges \p SrcRegs that are +/// guaranteed to have the same register class. This is necessary whenever we +/// successfully traverse a PHI instruction and find suitable sources coming +/// from its edges. By inserting a new PHI, we provide a rewritten PHI def +/// suitable to be used in a new COPY instruction. +static MachineInstr &insertPHI(MachineRegisterInfo &MRI, + const TargetInstrInfo &TII, + const SmallVectorImpl &SrcRegs, + MachineInstr &OrigPHI) { + assert(!SrcRegs.empty() && "No sources to create a PHI instruction?"); -/// Get the appropriated Rewriter for \p MI. -/// \return A pointer to a dynamically allocated Rewriter or nullptr if no -/// rewriter works for \p MI. -static Rewriter *getCopyRewriter(MachineInstr &MI, const TargetInstrInfo &TII) { - // Handle uncoalescable copy-like instructions. - if (MI.isBitcast() || MI.isRegSequenceLike() || MI.isInsertSubregLike() || - MI.isExtractSubregLike()) - return new UncoalescableRewriter(MI); + const TargetRegisterClass *NewRC = MRI.getRegClass(SrcRegs[0].Reg); + // NewRC is only correct if no subregisters are involved. findNextSource() + // should have rejected those cases already. 
+ assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand"); + Register NewVR = MRI.createVirtualRegister(NewRC); + MachineBasicBlock *MBB = OrigPHI.getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, &OrigPHI, OrigPHI.getDebugLoc(), + TII.get(TargetOpcode::PHI), NewVR); - switch (MI.getOpcode()) { - default: - return nullptr; - case TargetOpcode::COPY: - return new CopyRewriter(MI); - case TargetOpcode::INSERT_SUBREG: - return new InsertSubregRewriter(MI); - case TargetOpcode::EXTRACT_SUBREG: - return new ExtractSubregRewriter(MI, TII); - case TargetOpcode::REG_SEQUENCE: - return new RegSequenceRewriter(MI); + unsigned MBBOpIdx = 2; + for (const RegSubRegPair &RegPair : SrcRegs) { + MIB.addReg(RegPair.Reg, 0, RegPair.SubReg); + MIB.addMBB(OrigPHI.getOperand(MBBOpIdx).getMBB()); + // Since we're extended the lifetime of RegPair.Reg, clear the + // kill flags to account for that and make RegPair.Reg reaches + // the new PHI. + MRI.clearKillFlags(RegPair.Reg); + MBBOpIdx += 2; } + + return *MIB; } /// Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find @@ -1212,36 +1187,13 @@ getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII, return RegSubRegPair(0, 0); } -/// Optimize generic copy instructions to avoid cross register bank copy. -/// The optimization looks through a chain of copies and tries to find a source -/// that has a compatible register class. -/// Two register classes are considered to be compatible if they share the same -/// register bank. -/// New copies issued by this optimization are register allocator -/// friendly. This optimization does not remove any copy as it may -/// overconstrain the register allocator, but replaces some operands -/// when possible. -/// \pre isCoalescableCopy(*MI) is true. -/// \return True, when \p MI has been rewritten. False otherwise. 
-bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { - assert(isCoalescableCopy(MI) && "Invalid argument"); - assert(MI.getDesc().getNumDefs() == 1 && - "Coalescer can understand multiple defs?!"); - const MachineOperand &MODef = MI.getOperand(0); - // Do not rewrite physical definitions. - if (MODef.getReg().isPhysical()) - return false; - +bool PeepholeOptimizer::optimizeCoalescableCopyImpl(Rewriter &&CpyRewriter) { bool Changed = false; // Get the right rewriter for the current copy. - std::unique_ptr CpyRewriter(getCopyRewriter(MI, *TII)); - // If none exists, bail out. - if (!CpyRewriter) - return false; // Rewrite each rewritable source. RegSubRegPair Src; RegSubRegPair TrackPair; - while (CpyRewriter->getNextRewritableSource(Src, TrackPair)) { + while (CpyRewriter.getNextRewritableSource(Src, TrackPair)) { // Keep track of PHI nodes and its incoming edges when looking for sources. RewriteMapTy RewriteMap; // Try to find a more suitable source. If we failed to do so, or get the @@ -1257,12 +1209,13 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { continue; // Rewrite source. - if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { + if (CpyRewriter.RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { // We may have extended the live-range of NewSrc, account for that. MRI->clearKillFlags(NewSrc.Reg); Changed = true; } } + // TODO: We could have a clean-up method to tidy the instruction. // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0 // => v0 = COPY v1 @@ -1272,6 +1225,44 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { return Changed; } +/// Optimize generic copy instructions to avoid cross register bank copy. +/// The optimization looks through a chain of copies and tries to find a source +/// that has a compatible register class. +/// Two register classes are considered to be compatible if they share the same +/// register bank. 
+/// New copies issued by this optimization are register allocator +/// friendly. This optimization does not remove any copy as it may +/// overconstrain the register allocator, but replaces some operands +/// when possible. +/// \pre isCoalescableCopy(*MI) is true. +/// \return True, when \p MI has been rewritten. False otherwise. +bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { + assert(isCoalescableCopy(MI) && "Invalid argument"); + assert(MI.getDesc().getNumDefs() == 1 && + "Coalescer can understand multiple defs?!"); + const MachineOperand &MODef = MI.getOperand(0); + // Do not rewrite physical definitions. + if (MODef.getReg().isPhysical()) + return false; + + switch (MI.getOpcode()) { + case TargetOpcode::COPY: + return optimizeCoalescableCopyImpl(CopyRewriter(MI)); + case TargetOpcode::INSERT_SUBREG: + return optimizeCoalescableCopyImpl(InsertSubregRewriter(MI)); + case TargetOpcode::EXTRACT_SUBREG: + return optimizeCoalescableCopyImpl(ExtractSubregRewriter(MI, *TII)); + case TargetOpcode::REG_SEQUENCE: + return optimizeCoalescableCopyImpl(RegSequenceRewriter(MI)); + default: + // Handle uncoalescable copy-like instructions. + if (MI.isBitcast() || MI.isRegSequenceLike() || MI.isInsertSubregLike() || + MI.isExtractSubregLike()) + return optimizeCoalescableCopyImpl(UncoalescableRewriter(MI)); + return false; + } +} + /// Rewrite the source found through \p Def, by using the \p RewriteMap /// and create a new COPY instruction. More info about RewriteMap in /// PeepholeOptimizer::findNextSource. Right now this is only used to handle From ba70368f1380f8d22494fc8c100d2ab894a3cf94 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Thu, 23 Jan 2025 10:02:35 +0800 Subject: [PATCH 072/208] [Clang][Driver] Support linker relaxation options for LoongArch (#123587) This commit completed four tasks: - Add `-mrelax/-mno-relax` options support for LoongArch in clang driver. - Print error for `-gsplit-dwarf` with LoongArch linker relaxation (`-mrelax`). 
- Pass `-X` to linker to discard a plethora of `.L` symbols due to linker relaxation. - Forward `--no-relax` option to linker. --- .../clang/Basic/DiagnosticDriverKinds.td | 3 ++ .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 19 +++++++++++ clang/lib/Driver/ToolChains/Gnu.cpp | 2 +- clang/test/Driver/loongarch-relax-features.c | 33 +++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 clang/test/Driver/loongarch-relax-features.c diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 42c39ac6606c7f..612f7e330ba51e 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -808,6 +808,9 @@ def err_drv_loongarch_invalid_simd_option_combination : Error< def err_drv_loongarch_invalid_msimd_EQ : Error< "invalid argument '%0' to -msimd=; must be one of: none, lsx, lasx">; +def err_drv_loongarch_unsupported_with_linker_relaxation : Error< + "%0 is unsupported with LoongArch linker relaxation (-mrelax)">; + def err_drv_expand_response_file : Error< "failed to expand response file: %0">; diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index c9b45ce58bd4c9..e36272d2083206 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "LoongArch.h" +#include "../Clang.h" #include "ToolChains/CommonArgs.h" #include "clang/Basic/DiagnosticDriver.h" #include "clang/Driver/Driver.h" @@ -134,6 +135,24 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ))) Features.push_back("+lsx"); + // FIXME: Now we must use -mrelax to enable relax, maybe -mrelax will be set + // as default in the future. 
+ if (const Arg *A = + Args.getLastArg(options::OPT_mrelax, options::OPT_mno_relax)) { + if (A->getOption().matches(options::OPT_mrelax)) { + Features.push_back("+relax"); + // -gsplit-dwarf -mrelax requires DW_AT_high_pc/DW_AT_ranges/... indexing + // into .debug_addr, which is currently not implemented. + Arg *A; + if (getDebugFissionKind(D, Args, A) != DwarfFissionKind::None) + D.Diag( + clang::diag::err_drv_loongarch_unsupported_with_linker_relaxation) + << A->getAsString(Args); + } else { + Features.push_back("-relax"); + } + } + std::string ArchName; const Arg *MArch = Args.getLastArg(options::OPT_march_EQ); if (MArch) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 6dfa94bf2123be..f56eeda3cb5f6f 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -422,7 +422,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, return; } - if (Triple.isRISCV()) { + if (Triple.isLoongArch() || Triple.isRISCV()) { CmdArgs.push_back("-X"); if (Args.hasArg(options::OPT_mno_relax)) CmdArgs.push_back("--no-relax"); diff --git a/clang/test/Driver/loongarch-relax-features.c b/clang/test/Driver/loongarch-relax-features.c new file mode 100644 index 00000000000000..c6ef15a8fcff0e --- /dev/null +++ b/clang/test/Driver/loongarch-relax-features.c @@ -0,0 +1,33 @@ +/// Test -m[no-]relax options. 
+ +// RUN: %clang --target=loongarch32 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA32 +// RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64 +// RUN: %clang --target=loongarch32 -mno-relax -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA32-NORELAX +// RUN: %clang --target=loongarch64 -mno-relax -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64-NORELAX +// RUN: %clang --target=loongarch32 -mrelax -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA32-RELAX +// RUN: %clang --target=loongarch64 -mrelax -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64-RELAX + +/// Error when using -gsplit-dwarf with linker relaxation (-mrelax). + +// RUN: %clang -### -c --target=loongarch32 -mno-relax -g -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=SPLIT-DWARF +// RUN: not %clang -c --target=loongarch32-linux-gnu -mrelax -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF +// RUN: not %clang -c --target=loongarch32 -mrelax -gsplit-dwarf=single %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF +// RUN: %clang -### -c --target=loongarch64 -mno-relax -g -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=SPLIT-DWARF +// RUN: not %clang -c --target=loongarch64-linux-gnu -mrelax -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF +// RUN: not %clang -c --target=loongarch64 -mrelax -gsplit-dwarf=single %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF + +// LA32: "target-features"="+32bit" +// LA64: "target-features"="+64bit,+d,+f,+lsx,+ual" + +// LA32-NORELAX: "target-features"="+32bit,-relax" +// LA64-NORELAX: "target-features"="+64bit,+d,+f,+lsx,+ual,-relax" + +// LA32-RELAX: "target-features"="+32bit,+relax" +// LA64-RELAX: "target-features"="+64bit,+d,+f,+lsx,+relax,+ual" + +// SPLIT-DWARF: "-split-dwarf-file" +// ERR-SPLIT-DWARF: error: -gsplit-dwarf{{.*}} is unsupported with LoongArch linker relaxation (-mrelax) + +int foo(void) { + return 3; +} From 
15c2d4baf17292b4966d335846b30c50063f0265 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 09:06:26 +0700 Subject: [PATCH 073/208] PeepholeOpt: Remove check for subreg index on a def operand (#123943) This is looking at operand 0 of a REG_SEQUENCE, which can never have a subregister index. --- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index aec4aaa81761c7..48c25d5039bfd4 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -426,8 +426,8 @@ class RegSequenceRewriter : public Rewriter { const MachineOperand &MODef = CopyLike.getOperand(0); Dst.Reg = MODef.getReg(); - // If we have to compose sub-registers, bail. - return MODef.getSubReg() == 0; + assert(MODef.getSubReg() == 0 && "cannot have subregister def in SSA"); + return true; } bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { From 9cefa3e6fccf30959433b96a8a275417b1429f4e Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 22 Jan 2025 18:17:14 -0800 Subject: [PATCH 074/208] [msan] Generalize handleIntrinsicByApplyingToShadow by adding bitcasting (#123474) `handleIntrinsicByApplyingToShadow` (introduced in https://github.com/llvm/llvm-project/pull/114490) requires that the intrinsic supports integer-ish operands; this is not the case for all intrinsics. This patch generalizes the function to bitcast the shadow arguments to be the same type as the original intrinsic, thus guaranteeing that the intrinsic exists. Additionally, it casts the computed shadow to be an appropriate shadow type. This function assumes that the intrinsic will handle arbitrary bit-patterns (for example, if the intrinsic accepts floats for var1, we assume that it works normally even if inputs are NaNs etc.). 
--- .../Transforms/Instrumentation/MemorySanitizer.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 587e5c1cc842e0..b3f52b35940836 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4008,6 +4008,10 @@ struct MemorySanitizerVisitor : public InstVisitor { /// shadow[out] = /// intrinsic(shadow[var1], shadow[var2], opType) | shadow[opType] /// + /// CAUTION: this assumes that the intrinsic will handle arbitrary + /// bit-patterns (for example, if the intrinsic accepts floats for + /// var1, we require that it doesn't care if inputs are NaNs). + /// /// For example, this can be applied to the Arm NEON vector table intrinsics /// (tbl{1,2,3,4}). /// @@ -4022,7 +4026,11 @@ struct MemorySanitizerVisitor : public InstVisitor { // Don't use getNumOperands() because it includes the callee for (unsigned int i = 0; i < I.arg_size() - trailingVerbatimArgs; i++) { Value *Shadow = getShadow(&I, i); - ShadowArgs.push_back(Shadow); + + // Shadows are integer-ish types but some intrinsics require a + // different (e.g., floating-point) type. + ShadowArgs.push_back( + IRB.CreateBitCast(Shadow, I.getArgOperand(i)->getType())); } for (unsigned int i = I.arg_size() - trailingVerbatimArgs; i < I.arg_size(); @@ -4043,7 +4051,7 @@ struct MemorySanitizerVisitor : public InstVisitor { CombinedShadow = IRB.CreateOr(Shadow, CombinedShadow, "_msprop"); } - setShadow(&I, CombinedShadow); + setShadow(&I, IRB.CreateBitCast(CombinedShadow, getShadowTy(&I))); setOriginForNaryOp(I); } From ba3e6f0f0f2bebeb2b82e976ea1b5df007784862 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 23 Jan 2025 10:17:39 +0800 Subject: [PATCH 075/208] [RISCV][VLOPT] Remove dead passthru check in getOperandLog2EEW. 
NFC (#123911) We already bail if the user is tied in checkUsers, which is true for all passthrus. Remove the check in getOperandLog2EEW so that it only worries about computing the OperandInfo, and leaves the passthru correctness to checkUsers. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 63f4ab4d572d59..5f2d4e0585a0b3 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -208,13 +208,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags); - // We bail out early for instructions that have passthru with non NoRegister, - // which means they are using TU policy. We are not interested in these - // since they must preserve the entire register content. - if (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs() && - (MO.getReg() != RISCV::NoRegister)) - return std::nullopt; - bool IsMODef = MO.getOperandNo() == 0; // All mask operands have EEW=1 From 0fe8e70c6609ff86cd40fbb45a85a8ed04c153c2 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Wed, 22 Jan 2025 18:22:03 -0800 Subject: [PATCH 076/208] Revert "Reland "[HLSL] Implement the `reflect` HLSL function"" (#124046) Reverts llvm/llvm-project#123853 The introduction of `reflect-error.ll` surfaced a bug with the use of `report_fatal_error` in `SPIRVInstructionSelector` that was propagated into the pr. This has caused a build-bot breakage, and the work to solve the underlying issue is tracked here: https://github.com/llvm/llvm-project/issues/124045. We can re-apply this commit when the underlying issue is resolved. 
--- clang/include/clang/Basic/BuiltinsSPIRV.td | 6 - clang/lib/CodeGen/CGBuiltin.cpp | 13 -- clang/lib/Headers/hlsl/hlsl_detail.h | 16 -- clang/lib/Headers/hlsl/hlsl_intrinsics.h | 43 ----- clang/lib/Sema/SemaSPIRV.cpp | 32 ---- clang/test/CodeGenHLSL/builtins/reflect.hlsl | 177 ------------------ clang/test/CodeGenSPIRV/Builtins/reflect.c | 32 ---- .../SemaHLSL/BuiltIns/reflect-errors.hlsl | 33 ---- .../test/SemaSPIRV/BuiltIns/reflect-errors.c | 23 --- llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 - .../Target/SPIRV/SPIRVInstructionSelector.cpp | 16 +- .../CodeGen/SPIRV/hlsl-intrinsics/reflect.ll | 33 ---- .../CodeGen/SPIRV/opencl/reflect-error.ll | 13 -- 13 files changed, 4 insertions(+), 434 deletions(-) delete mode 100644 clang/test/CodeGenHLSL/builtins/reflect.hlsl delete mode 100644 clang/test/CodeGenSPIRV/Builtins/reflect.c delete mode 100644 clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl delete mode 100644 clang/test/SemaSPIRV/BuiltIns/reflect-errors.c delete mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll delete mode 100644 llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll diff --git a/clang/include/clang/Basic/BuiltinsSPIRV.td b/clang/include/clang/Basic/BuiltinsSPIRV.td index 34933e889ba314..f72c555921dfe6 100644 --- a/clang/include/clang/Basic/BuiltinsSPIRV.td +++ b/clang/include/clang/Basic/BuiltinsSPIRV.td @@ -19,9 +19,3 @@ def SPIRVLength : Builtin { let Attributes = [NoThrow, Const]; let Prototype = "void(...)"; } - -def SPIRVReflect : Builtin { - let Spellings = ["__builtin_spirv_reflect"]; - let Attributes = [NoThrow, Const]; - let Prototype = "void(...)"; -} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d1a533ca8d7091..f1515347fb816c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20433,19 +20433,6 @@ Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/X->getType()->getScalarType(), Intrinsic::spv_length, ArrayRef{X}, nullptr, 
"spv.length"); } - case SPIRV::BI__builtin_spirv_reflect: { - Value *I = EmitScalarExpr(E->getArg(0)); - Value *N = EmitScalarExpr(E->getArg(1)); - assert(E->getArg(0)->getType()->hasFloatingRepresentation() && - E->getArg(1)->getType()->hasFloatingRepresentation() && - "Reflect operands must have a float representation"); - assert(E->getArg(0)->getType()->isVectorType() && - E->getArg(1)->getType()->isVectorType() && - "Reflect operands must be a vector"); - return Builder.CreateIntrinsic( - /*ReturnType=*/I->getType(), Intrinsic::spv_reflect, - ArrayRef{I, N}, nullptr, "spv.reflect"); - } } return nullptr; } diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h index 0d568539cd66a8..b2c8cc6c5c3dbb 100644 --- a/clang/lib/Headers/hlsl/hlsl_detail.h +++ b/clang/lib/Headers/hlsl/hlsl_detail.h @@ -79,22 +79,6 @@ constexpr enable_if_t::value || is_same::value, T> distance_vec_impl(vector X, vector Y) { return length_vec_impl(X - Y); } - -template -constexpr enable_if_t::value || is_same::value, T> -reflect_impl(T I, T N) { - return I - 2 * N * I * N; -} - -template -constexpr vector reflect_vec_impl(vector I, vector N) { -#if (__has_builtin(__builtin_spirv_reflect)) - return __builtin_spirv_reflect(I, N); -#else - return I - 2 * N * __builtin_hlsl_dot(I, N); -#endif -} - } // namespace __detail } // namespace hlsl #endif //_HLSL_HLSL_DETAILS_H_ diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 3b47074f07ecf4..d1e4eb08aa7646 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -2008,49 +2008,6 @@ double3 rcp(double3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rcp) double4 rcp(double4); -//===----------------------------------------------------------------------===// -// reflect builtin -//===----------------------------------------------------------------------===// - -/// \fn T reflect(T I, T N) -/// \brief Returns a reflection 
using an incident ray, \a I, and a surface -/// normal, \a N. -/// \param I The incident ray. -/// \param N The surface normal. -/// -/// The return value is a floating-point vector that represents the reflection -/// of the incident ray, \a I, off a surface with the normal \a N. -/// -/// This function calculates the reflection vector using the following formula: -/// V = I - 2 * N * dot(I N) . -/// -/// N must already be normalized in order to achieve the desired result. -/// -/// The operands must all be a scalar or vector whose component type is -/// floating-point. -/// -/// Result type and the type of all operands must be the same type. - -_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) -const inline half reflect(half I, half N) { - return __detail::reflect_impl(I, N); -} - -const inline float reflect(float I, float N) { - return __detail::reflect_impl(I, N); -} - -template -_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) -const inline vector reflect(vector I, vector N) { - return __detail::reflect_vec_impl(I, N); -} - -template -const inline vector reflect(vector I, vector N) { - return __detail::reflect_vec_impl(I, N); -} - //===----------------------------------------------------------------------===// // rsqrt builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index 94534485e07c33..dc49fc79073572 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -69,38 +69,6 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(unsigned BuiltinID, TheCall->setType(RetTy); break; } - case SPIRV::BI__builtin_spirv_reflect: { - if (SemaRef.checkArgCount(TheCall, 2)) - return true; - - ExprResult A = TheCall->getArg(0); - QualType ArgTyA = A.get()->getType(); - auto *VTyA = ArgTyA->getAs(); - if (VTyA == nullptr) { - SemaRef.Diag(A.get()->getBeginLoc(), - diag::err_typecheck_convert_incompatible) - << ArgTyA - << SemaRef.Context.getVectorType(ArgTyA, 
2, VectorKind::Generic) << 1 - << 0 << 0; - return true; - } - - ExprResult B = TheCall->getArg(1); - QualType ArgTyB = B.get()->getType(); - auto *VTyB = ArgTyB->getAs(); - if (VTyB == nullptr) { - SemaRef.Diag(A.get()->getBeginLoc(), - diag::err_typecheck_convert_incompatible) - << ArgTyB - << SemaRef.Context.getVectorType(ArgTyB, 2, VectorKind::Generic) << 1 - << 0 << 0; - return true; - } - - QualType RetTy = ArgTyA; - TheCall->setType(RetTy); - break; - } } return false; } diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl deleted file mode 100644 index 35ee059697c4ba..00000000000000 --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl +++ /dev/null @@ -1,177 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -O1 -o - | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK - -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( -// CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 -// CHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[N]], [[N]] -// CHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[MUL_I]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]] -// CHECK-NEXT: ret half [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( -// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half 
noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 -// SPVCHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[N]], [[N]] -// SPVCHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[MUL_I]] -// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]] -// SPVCHECK-NEXT: ret half [[SUB_I]] -// -half test_reflect_half(half I, half N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( -// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[I]], <2 x half> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <2 x half> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( -// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> [[I]], <2 x half> [[N]]) -// 
SPVCHECK-NEXT: ret <2 x half> [[SPV_REFLECT_I]] -// -half2 test_reflect_half2(half2 I, half2 N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( -// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[I]], <3 x half> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x half> poison, half [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <3 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <3 x half> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( -// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> [[I]], <3 x half> [[N]]) -// SPVCHECK-NEXT: ret <3 x half> [[SPV_REFLECT_I]] -// -half3 test_reflect_half3(half3 I, half3 N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( -// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf 
nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[I]], <4 x half> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <4 x half> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( -// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> [[I]], <4 x half> [[N]]) -// SPVCHECK-NEXT: ret <4 x half> [[SPV_REFLECT_I]] -// -half4 test_reflect_half4(half4 I, half4 N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( -// CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 -// CHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[N]], [[N]] -// CHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[MUL_I]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]] -// CHECK-NEXT: ret float [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( -// SPVCHECK-SAME: float noundef nofpclass(nan 
inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 -// SPVCHECK-NEXT: [[TMP0:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[N]], [[N]] -// SPVCHECK-NEXT: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[MUL_I]] -// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]] -// SPVCHECK-NEXT: ret float [[SUB_I]] -// -float test_reflect_float(float I, float N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( -// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[I]], <2 x float> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <2 x float> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( -// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> 
@llvm.spv.reflect.v2f32(<2 x float> [[I]], <2 x float> [[N]]) -// SPVCHECK-NEXT: ret <2 x float> [[SPV_REFLECT_I]] -// -float2 test_reflect_float2(float2 I, float2 N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( -// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[I]], <3 x float> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x float> poison, float [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <3 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <3 x float> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( -// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[I]], <3 x float> [[N]]) -// SPVCHECK-NEXT: ret <3 x float> [[SPV_REFLECT_I]] -// -float3 test_reflect_float3(float3 I, float3 N) { - return reflect(I, N); -} - -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( -// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr 
#[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[I]], <4 x float> [[N]]) -// CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[DOTSCALAR]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[TMP1]], [[N]] -// CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[I]], [[MUL1_I]] -// CHECK-NEXT: ret <4 x float> [[SUB_I]] -// -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( -// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[I]], <4 x float> [[N]]) -// SPVCHECK-NEXT: ret <4 x float> [[SPV_REFLECT_I]] -// -float4 test_reflect_float4(float4 I, float4 N) { - return reflect(I, N); -} diff --git a/clang/test/CodeGenSPIRV/Builtins/reflect.c b/clang/test/CodeGenSPIRV/Builtins/reflect.c deleted file mode 100644 index f51ac27a07457a..00000000000000 --- a/clang/test/CodeGenSPIRV/Builtins/reflect.c +++ /dev/null @@ -1,32 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 - -// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -emit-llvm -o - | FileCheck %s - -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); - -// CHECK-LABEL: define spir_func <2 x float> 
@test_reflect_float2( -// CHECK-SAME: <2 x float> noundef [[X:%.*]], <2 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SPV_REFLECT:%.*]] = tail call <2 x float> @llvm.spv.reflect.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) -// CHECK-NEXT: ret <2 x float> [[SPV_REFLECT]] -// -float2 test_reflect_float2(float2 X, float2 Y) { return __builtin_spirv_reflect(X, Y); } - -// CHECK-LABEL: define spir_func <3 x float> @test_reflect_float3( -// CHECK-SAME: <3 x float> noundef [[X:%.*]], <3 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SPV_REFLECT:%.*]] = tail call <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[X]], <3 x float> [[Y]]) -// CHECK-NEXT: ret <3 x float> [[SPV_REFLECT]] -// -float3 test_reflect_float3(float3 X, float3 Y) { return __builtin_spirv_reflect(X, Y); } - -// CHECK-LABEL: define spir_func <4 x float> @test_reflect_float4( -// CHECK-SAME: <4 x float> noundef [[X:%.*]], <4 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SPV_REFLECT:%.*]] = tail call <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[X]], <4 x float> [[Y]]) -// CHECK-NEXT: ret <4 x float> [[SPV_REFLECT]] -// -float4 test_reflect_float4(float4 X, float4 Y) { return __builtin_spirv_reflect(X, Y); } - diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl deleted file mode 100644 index 28cf992ed602bf..00000000000000 --- a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl +++ /dev/null @@ -1,33 +0,0 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify - -float test_no_second_arg(float2 p0) { - return reflect(p0); - // expected-error@-1 {{no matching function for call to 'reflect'}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not 
viable: requires 2 arguments, but 1 was provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 1 was provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 1 was provided}} -} - -float test_too_many_arg(float2 p0) { - return reflect(p0, p0, p0); - // expected-error@-1 {{no matching function for call to 'reflect'}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 2 arguments, but 3 were provided}} -} - -float test_double_inputs(double p0, double p1) { - return reflect(p0, p1); - // expected-error@-1 {{call to 'reflect' is ambiguous}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} -} - -float test_int_inputs(int p0, int p1) { - return reflect(p0, p1); - // expected-error@-1 {{call to 'reflect' is ambiguous}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} - // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} -} diff --git a/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c b/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c deleted file mode 100644 index c93dd2ffcc9c3e..00000000000000 --- a/clang/test/SemaSPIRV/BuiltIns/reflect-errors.c +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify - -typedef float float2 __attribute__((ext_vector_type(2))); - 
-float2 test_no_second_arg(float2 p0) { - return __builtin_spirv_reflect(p0); - // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} -} - -float2 test_too_many_arg(float2 p0) { - return __builtin_spirv_reflect(p0, p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} -} - -float test_double_scalar_inputs(double p0, double p1) { - return __builtin_spirv_reflect(p0, p1); - // expected-error@-1 {{passing 'double' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(double)))) double' (vector of 2 'double' values)}} -} - -float test_int_scalar_inputs(int p0, int p1) { - return __builtin_spirv_reflect(p0, p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(int)))) int' (vector of 2 'int' values)}} -} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 4da464d8010f76..be337dbccaf8a9 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -71,7 +71,6 @@ let TargetPrefix = "spv" in { [IntrNoMem] >; def int_spv_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; - def int_spv_reflect : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_spv_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [LLVMMatchType<0>, llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 8257ad10dd8c41..f5409c27d6ea3d 100644 --- 
a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -270,8 +270,10 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectPhi(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - bool selectExtInst(Register ResVReg, const SPIRVType *RestType, - MachineInstr &I, GL::GLSLExtInst GLInst) const; + [[maybe_unused]] bool selectExtInst(Register ResVReg, + const SPIRVType *RestType, + MachineInstr &I, + GL::GLSLExtInst GLInst) const; bool selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, CL::OpenCLExtInst CLInst) const; bool selectExtInst(Register ResVReg, const SPIRVType *ResType, @@ -900,14 +902,6 @@ bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, GL::GLSLExtInst GLInst) const { - if (!STI.canUseExtInstSet( - SPIRV::InstructionSet::InstructionSet::GLSL_std_450)) { - std::string DiagMsg; - raw_string_ostream OS(DiagMsg); - I.print(OS, true, false, false, false); - DiagMsg += " is only supported with the GLSL extended instruction set.\n"; - report_fatal_error(DiagMsg.c_str(), false); - } return selectExtInst(ResVReg, ResType, I, {{SPIRV::InstructionSet::GLSL_std_450, GLInst}}); } @@ -3036,8 +3030,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); - case Intrinsic::spv_reflect: - return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: return selectExtInst(ResVReg, ResType, I, CL::rsqrt, GL::InverseSqrt); case Intrinsic::spv_sign: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll deleted file mode 100644 index 18962807f84ffc..00000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reflect.ll +++ /dev/null @@ -1,33 
+0,0 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for reflect are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @reflect_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Reflect %[[#arg0]] %[[#arg1]] - %spv.reflect = call <4 x half> @llvm.spv.reflect.f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %spv.reflect -} - -define noundef <4 x float> @reflect_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Reflect %[[#arg0]] %[[#arg1]] - %spv.reflect = call <4 x float> @llvm.spv.reflect.f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %spv.reflect -} - -declare <4 x half> @llvm.spv.reflect.f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.spv.reflect.f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll b/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll deleted file mode 100644 index 3b3edc13131f58..00000000000000 --- a/llvm/test/CodeGen/SPIRV/opencl/reflect-error.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s -; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s - -; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.reflect), %{{.*}}, %{{.*}} is only supported with the GLSL extended instruction set. - -define noundef <4 x float> @reflect_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - %spv.reflect = call <4 x float> @llvm.spv.reflect.f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %spv.reflect -} - -declare <4 x float> @llvm.spv.reflect.f32(<4 x float>, <4 x float>) - From 3ef90f843fee74ff811ef88246734475f50e2073 Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Thu, 23 Jan 2025 10:30:30 +0800 Subject: [PATCH 077/208] [emitc] Fix the translation switchop with argument of expressionop (#123701) Now a `emitc.switch` with argument of `emitc.expression` wouldn't emit its argument to cpp. This patch fix it. --- mlir/lib/Target/Cpp/TranslateToCpp.cpp | 5 +++- mlir/test/Target/Cpp/switch.mlir | 36 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index a91f5ab9311401..01de0e41f20353 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -475,7 +475,10 @@ static LogicalResult printOperation(CppEmitter &emitter, emitc::SwitchOp switchOp) { raw_indented_ostream &os = emitter.ostream(); - os << "\nswitch (" << emitter.getOrCreateName(switchOp.getArg()) << ") {"; + os << "\nswitch ("; + if (failed(emitter.emitOperand(switchOp.getArg()))) + return failure(); + os << ") {"; for (auto pair : llvm::zip(switchOp.getCases(), switchOp.getCaseRegions())) { os << "\ncase " << std::get<0>(pair) << ": {\n"; diff --git a/mlir/test/Target/Cpp/switch.mlir b/mlir/test/Target/Cpp/switch.mlir index 1a8f5e2dfd2b61..4e20c1fc6536a4 100644 --- 
a/mlir/test/Target/Cpp/switch.mlir +++ b/mlir/test/Target/Cpp/switch.mlir @@ -882,3 +882,39 @@ func.func @emitc_switch_ui64() { } return } + +// CPP-DEFAULT-LABEL: void emitc_switch_expression() { +// CPP-DEFAULT: int64_t v1 = 42; +// CPP-DEFAULT: switch (-v1) { +// CPP-DEFAULT: default: { +// CPP-DEFAULT: break; +// CPP-DEFAULT: } +// CPP-DEFAULT: } +// CPP-DEFAULT: return; +// CPP-DEFAULT: } + +// CPP-DECLTOP-LABEL: void emitc_switch_expression() { +// CPP-DECLTOP: int64_t v1; +// CPP-DECLTOP: v1 = 42; +// CPP-DECLTOP: switch (-v1) { +// CPP-DECLTOP: default: { +// CPP-DECLTOP: break; +// CPP-DECLTOP: } +// CPP-DECLTOP: } +// CPP-DECLTOP: return; +// CPP-DECLTOP: } + +func.func @emitc_switch_expression() { + %x = "emitc.constant"(){value = 42 : i64} : () -> i64 + + %0 = emitc.expression : i64 { + %a = emitc.unary_minus %x : (i64) -> i64 + emitc.yield %a : i64 + } + + emitc.switch %0 : i64 + default { + emitc.yield + } + return +} From b46fcb9fa32f24660b1b8858d5c4cbdb76ef9d8b Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Thu, 23 Jan 2025 10:53:00 +0800 Subject: [PATCH 078/208] [Clang] Implement CWG 2628 "Implicit deduction guides should propagate constraints" (#111143) Closes https://github.com/llvm/llvm-project/issues/98592 --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 70 ++++++-- clang/test/CXX/drs/cwg26xx.cpp | 19 +- clang/test/SemaTemplate/deduction-guide.cpp | 163 ++++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 5 files changed, 227 insertions(+), 30 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c749e34d6d2c5d..abc9ce60e7d01f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -363,6 +363,9 @@ Resolutions to C++ Defect Reports - Clang now allows trailing requires clause on explicit deduction guides. (`CWG2707: Deduction guides cannot have a trailing requires-clause `_). +- Respect constructor constraints during CTAD. 
+ (`CWG2628: Implicit deduction guides should propagate constraints `_). + - Clang now diagnoses a space in the first production of a ``literal-operator-id`` by default. (`CWG2521: User-defined literals and reserved identifiers `_). diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 5f813ba3a597a3..950783303efb32 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -194,12 +194,14 @@ class ExtractTypeForDeductionGuide // A deduction guide can be either a template or a non-template function // declaration. If \p TemplateParams is null, a non-template function // declaration will be created. -NamedDecl *buildDeductionGuide( - Sema &SemaRef, TemplateDecl *OriginalTemplate, - TemplateParameterList *TemplateParams, CXXConstructorDecl *Ctor, - ExplicitSpecifier ES, TypeSourceInfo *TInfo, SourceLocation LocStart, - SourceLocation Loc, SourceLocation LocEnd, bool IsImplicit, - llvm::ArrayRef MaterializedTypedefs = {}) { +NamedDecl * +buildDeductionGuide(Sema &SemaRef, TemplateDecl *OriginalTemplate, + TemplateParameterList *TemplateParams, + CXXConstructorDecl *Ctor, ExplicitSpecifier ES, + TypeSourceInfo *TInfo, SourceLocation LocStart, + SourceLocation Loc, SourceLocation LocEnd, bool IsImplicit, + llvm::ArrayRef MaterializedTypedefs = {}, + Expr *FunctionTrailingRC = nullptr) { DeclContext *DC = OriginalTemplate->getDeclContext(); auto DeductionGuideName = SemaRef.Context.DeclarationNames.getCXXDeductionGuideName( @@ -210,9 +212,9 @@ NamedDecl *buildDeductionGuide( TInfo->getTypeLoc().castAs().getParams(); // Build the implicit deduction guide template. 
- auto *Guide = - CXXDeductionGuideDecl::Create(SemaRef.Context, DC, LocStart, ES, Name, - TInfo->getType(), TInfo, LocEnd, Ctor); + auto *Guide = CXXDeductionGuideDecl::Create( + SemaRef.Context, DC, LocStart, ES, Name, TInfo->getType(), TInfo, LocEnd, + Ctor, DeductionCandidate::Normal, FunctionTrailingRC); Guide->setImplicit(IsImplicit); Guide->setParams(Params); @@ -354,10 +356,11 @@ struct ConvertConstructorToDeductionGuideTransform { // template arguments) of the constructor, if any. TemplateParameterList *TemplateParams = SemaRef.GetTemplateParameterList(Template); + SmallVector Depth1Args; + Expr *OuterRC = TemplateParams->getRequiresClause(); if (FTD) { TemplateParameterList *InnerParams = FTD->getTemplateParameters(); SmallVector AllParams; - SmallVector Depth1Args; AllParams.reserve(TemplateParams->size() + InnerParams->size()); AllParams.insert(AllParams.begin(), TemplateParams->begin(), TemplateParams->end()); @@ -390,7 +393,7 @@ struct ConvertConstructorToDeductionGuideTransform { /*EvaluateConstraint=*/false); } - assert(NewParam->getTemplateDepth() == 0 && + assert(getDepthAndIndex(NewParam).first == 0 && "Unexpected template parameter depth"); AllParams.push_back(NewParam); @@ -406,10 +409,11 @@ struct ConvertConstructorToDeductionGuideTransform { Args.addOuterRetainedLevel(); if (NestedPattern) Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); - ExprResult E = SemaRef.SubstExpr(InnerRC, Args); - if (E.isInvalid()) + ExprResult E = + SemaRef.SubstConstraintExprWithoutSatisfaction(InnerRC, Args); + if (!E.isUsable()) return nullptr; - RequiresClause = E.getAs(); + RequiresClause = E.get(); } TemplateParams = TemplateParameterList::Create( @@ -445,10 +449,46 @@ struct ConvertConstructorToDeductionGuideTransform { return nullptr; TypeSourceInfo *NewTInfo = TLB.getTypeSourceInfo(SemaRef.Context, NewType); + // At this point, the function parameters are already 'instantiated' in the + // current scope. 
Substitute into the constructor's trailing + // requires-clause, if any. + Expr *FunctionTrailingRC = nullptr; + if (Expr *RC = CD->getTrailingRequiresClause()) { + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(Depth1Args); + Args.addOuterRetainedLevel(); + if (NestedPattern) + Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); + ExprResult E = SemaRef.SubstConstraintExprWithoutSatisfaction(RC, Args); + if (!E.isUsable()) + return nullptr; + FunctionTrailingRC = E.get(); + } + + // C++ [over.match.class.deduct]p1: + // If C is defined, for each constructor of C, a function template with + // the following properties: + // [...] + // - The associated constraints are the conjunction of the associated + // constraints of C and the associated constraints of the constructor, if + // any. + if (OuterRC) { + // The outer template parameters are not transformed, so their + // associated constraints don't need substitution. + if (!FunctionTrailingRC) + FunctionTrailingRC = OuterRC; + else + FunctionTrailingRC = BinaryOperator::Create( + SemaRef.Context, /*lhs=*/OuterRC, /*rhs=*/FunctionTrailingRC, + BO_LAnd, SemaRef.Context.BoolTy, VK_PRValue, OK_Ordinary, + TemplateParams->getTemplateLoc(), FPOptionsOverride()); + } + return buildDeductionGuide( SemaRef, Template, TemplateParams, CD, CD->getExplicitSpecifier(), NewTInfo, CD->getBeginLoc(), CD->getLocation(), CD->getEndLoc(), - /*IsImplicit=*/true, MaterializedTypedefs); + /*IsImplicit=*/true, MaterializedTypedefs, FunctionTrailingRC); } /// Build a deduction guide with the specified parameter types. 
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp index efc49b0b502a7c..a817a1ba3e31db 100644 --- a/clang/test/CXX/drs/cwg26xx.cpp +++ b/clang/test/CXX/drs/cwg26xx.cpp @@ -132,27 +132,18 @@ struct E { #endif } // namespace cwg2627 -namespace cwg2628 { // cwg2628: no - // this was reverted for the 16.x release - // due to regressions, see the issue for more details: - // https://github.com/llvm/llvm-project/issues/60777 +namespace cwg2628 { // cwg2628: 20 #if __cplusplus >= 202002L template struct foo { - // The expected notes below should be removed when cwg2628 is fully implemented again - constexpr foo() requires (!A && !B) = delete; // #cwg2628-ctor-1 - constexpr foo() requires (A || B) = delete; // #cwg2628-ctor-2 + constexpr foo() requires (!A && !B) = delete; // #cwg2628-ctor + constexpr foo() requires (A || B) = delete; }; void f() { - // The FIXME's below should be the expected errors when cwg2628 is - // fully implemented again. foo fooable; // #cwg2628-fooable - // since-cxx20-error@-1 {{ambiguous deduction for template arguments of 'foo'}} - // since-cxx20-note@#cwg2628-ctor-1 {{candidate function [with A = false, B = false]}} - // since-cxx20-note@#cwg2628-ctor-2 {{candidate function [with A = false, B = false]}} - // FIXME-since-cxx20-error@#cwg2628-fooable {{call to deleted}} - // FIXME-since-cxx20-note@#cwg2628-ctor {{marked deleted here}} + // since-cxx20-error@#cwg2628-fooable {{call to deleted}} + // since-cxx20-note@#cwg2628-ctor {{marked deleted here}} } #endif } // namespace cwg2628 diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index 39250f0617f4b5..c0b56f9e6ac7b8 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -479,6 +479,169 @@ A a{.f1 = {1}}; } // namespace GH83368 +namespace GH60777 { + +template constexpr bool True() { return true; } + +template + requires(sizeof(T) > 1) +struct A { + template + 
requires(sizeof...(Ts) == 0) + A(T val, Ts... tail) + requires(True()) + {} +}; + +A a(42); + +// `requires (sizeof(T) > 1)` goes into the deduction guide together with +// `requires (True())`, while `requires(sizeof...(Ts) == 0)` goes into +// the template parameter list of the synthesized declaration. + +// CHECK-LABEL: Dumping GH60777::: +// CHECK-NEXT: FunctionTemplateDecl 0x{{.+}} <{{.+}}> {{.+}} implicit +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:20 referenced typename depth 0 index 0 T +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:25 typename depth 0 index 1 ... Ts +// CHECK-NEXT: |-ParenExpr 0x{{.+}} <{{.+}}> 'bool' +// CHECK-NEXT: | `-BinaryOperator 0x{{.+}} <{{.+}}> 'bool' '==' +// CHECK-NEXT: | |-SizeOfPackExpr {{.+}} Ts +// CHECK-NEXT: | | `-TemplateArgument type 'Ts...':'type-parameter-0-1...' +// CHECK-NEXT: | | `-PackExpansionType 0x{{.+}} 'Ts...' dependent +// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | | `-TemplateTypeParm 0x{{.+}} 'Ts' +// CHECK-NEXT: | `-ImplicitCastExpr {{.+}} +// CHECK-NEXT: | `-IntegerLiteral 0x{{.+}} <{{.+}}> 'int' 0 +// CHECK-NEXT: |-CXXDeductionGuideDecl 0x{{.+}} <{{.+}}> line:{{.+}} implicit 'auto (T, Ts...) -> A' +// CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} <{{.+}}> col:{{.+}} val 'T' +// CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} <{{.+}}> col:{{.+}} tail 'Ts...' 
pack +// CHECK-NEXT: | `-BinaryOperator 0x{{.+}} <{{.+}}> 'bool' '&&' +// CHECK-NEXT: | |-ParenExpr 0x{{.+}} <{{.+}}> 'bool' +// CHECK-NEXT: | | `-BinaryOperator 0x{{.+}} <{{.+}}> 'bool' '>' +// CHECK-NEXT: | | |-UnaryExprOrTypeTraitExpr {{.+}} sizeof 'T' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} +// CHECK-NEXT: | | `-IntegerLiteral 0x{{.+}} <{{.+}}> 'int' 1 +// CHECK-NEXT: | `-ParenExpr 0x{{.+}} <{{.+}}> '' +// CHECK-NEXT: | `-CallExpr 0x{{.+}} <{{.+}}> '' +// CHECK-NEXT: | `-UnresolvedLookupExpr 0x{{.+}} '' {{.+}} +// CHECK-NEXT: | `-TemplateArgument type 'Ts...':'type-parameter-0-1...' +// CHECK-NEXT: | `-PackExpansionType 0x{{.+}} 'Ts...' dependent +// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'Ts' + +template +struct B { + template + B(T val, Ts... tail) + requires(True()) + {} +}; + +B b(42, 43); +// expected-error@-1 {{no viable constructor}} \ +// expected-note@-6 {{constraints not satisfied}} \ +// expected-note@-5 {{because substituted constraint expression is ill-formed}} \ +// expected-note@-6 {{implicit deduction guide declared as 'template B(T val, Ts ...tail) -> B requires (True())'}} \ +// expected-note@-8 {{function template not viable}} \ +// expected-note@-8 {{implicit deduction guide declared as 'template B(B) -> B'}} + +} // namespace GH60777 + +// Examples from @hokein. +namespace GH98592 { + +template concept True = true; +double arr3[3]; + +template +struct X { + const int size; + template + constexpr X(T, U(&)[3]) requires True : size(sizeof(T)) {} +}; + +template +X(T, U (&)[3]) -> X; + +constexpr X x(3, arr3); + +// The synthesized deduction guide is more constrained than the explicit one. 
+static_assert(x.size == 4); + +// CHECK-LABEL: Dumping GH98592::: +// CHECK-NEXT: FunctionTemplateDecl 0x{{.+}} <{{.+}}> col:13 implicit +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:17 referenced class depth 0 index 0 T +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:19 class depth 0 index 1 U +// CHECK-NEXT: |-CXXDeductionGuideDecl 0x{{.+}} <{{.+}}> col:13 implicit 'auto (T, U (&)[3]) -> X' +// CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} col:16 'T' +// CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} col:21 'U (&)[3]' +// CHECK-NEXT: | `-ConceptSpecializationExpr 0x{{.+}} 'bool' Concept 0x{{.+}} 'True' +// CHECK-NEXT: | |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 +// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-0' +// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// CHECK-NEXT: | `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' +// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 +// CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'T' +// CHECK-NEXT: `-CXXDeductionGuideDecl 0x{{.+}} col:13 implicit used 'auto (int, double (&)[3]) -> GH98592::X' implicit_instantiation +// CHECK-NEXT: |-TemplateArgument type 'int' +// CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'int' +// CHECK-NEXT: |-TemplateArgument type 'double' +// CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'double' +// CHECK-NEXT: |-ParmVarDecl 0x{{.+}} col:16 'int' +// CHECK-NEXT: |-ParmVarDecl 0x{{.+}} col:21 'double (&)[3]' +// CHECK-NEXT: `-ConceptSpecializationExpr 0x{{.+}} 'bool' Concept 0x{{.+}} 'True' +// CHECK-NEXT: |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 +// CHECK-NEXT: | `-TemplateArgument type 'type-parameter-0-0' +// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// CHECK-NEXT: `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' +// CHECK-NEXT: `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 +// 
CHECK-NEXT: `-TemplateTypeParm 0x{{.+}} 'T' + +template requires True struct Y { + const int size; + template + constexpr Y(T, U(&)[3]) : size(sizeof(T)) {} +}; + +template Y(T, U (&)[3]) -> Y; + +constexpr Y y(3, arr3); + +// Likewise, the synthesized deduction guide should be preferred +// according to [over.match.class.deduct]p1. +static_assert(y.size == 4); + +// Dumping GH98592::: +// FunctionTemplateDecl 0x{{.+}} <{{.+}}> col:13 implicit +// |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:17 referenced class depth 0 index 0 T +// |-TemplateTypeParmDecl 0x{{.+}} <{{.+}}> col:19 class depth 0 index 1 U +// |-CXXDeductionGuideDecl 0x{{.+}} <{{.+}}> col:13 implicit 'auto (T, U (&)[3]) -> Y' +// | |-ParmVarDecl 0x{{.+}} col:16 'T' +// | |-ParmVarDecl 0x{{.+}} col:21 'U (&)[3]' +// | `-ConceptSpecializationExpr 0x{{.+}} <{{.+}}> 'bool' Concept 0x{{.+}} 'True' +// | |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 +// | | `-TemplateArgument type 'type-parameter-0-0' +// | | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// | `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' +// | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 +// | `-TemplateTypeParm 0x{{.+}} 'T' +// `-CXXDeductionGuideDecl 0x{{.+}} <{{.+}}> col:13 implicit used 'auto (int, double (&)[3]) -> GH98592::Y' implicit_instantiation +// |-TemplateArgument type 'int' +// | `-BuiltinType 0x{{.+}} 'int' +// |-TemplateArgument type 'double' +// | `-BuiltinType 0x{{.+}} 'double' +// |-ParmVarDecl 0x{{.+}} col:16 'int' +// |-ParmVarDecl 0x{{.+}} col:21 'double (&)[3]' +// `-ConceptSpecializationExpr 0x{{.+}} <{{.+}}> 'bool' Concept 0x{{.+}} 'True' +// |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 +// | `-TemplateArgument type 'type-parameter-0-0' +// | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' +// `-TemplateTypeParmType 0x{{.+}} 'T' 
dependent depth 0 index 0 +// `-TemplateTypeParm 0x{{.+}} 'T' + +} // namespce GH98592 + namespace GH122134 { template diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 564502c1f3e92f..472f4fbd975378 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -15611,7 +15611,7 @@

C++ defect report implementation status

2628 DRWP Implicit deduction guides should propagate constraints - No + Clang 20 2629 From 652ff20140d79544db4dfa21314fc62c3c9182e5 Mon Sep 17 00:00:00 2001 From: Renaud Kauffmann Date: Wed, 22 Jan 2025 19:06:28 -0800 Subject: [PATCH 079/208] [flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect (#123840) With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the `llvm.atomicrw` operation. I am only submitting changes for `atomicadd` to gather feedback. If we are to proceed with these changes I will add support for all other applicable atomic operations following this pattern. --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 26 ++++++++++++++++- flang/module/cudadevice.f90 | 28 ++++++++++++++++++- flang/test/Lower/CUDA/cuda-device-proc.cuf | 13 +++++++++ .../Semantics/cuf-device-procedures01.cuf | 9 ++++++ 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 9c9c0609f4fc3c..e2ea89483ef11f 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -185,6 +185,7 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef); + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef); mlir::Value genAsind(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6a343645ab8786..63c013dda95e64 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -44,6 +44,7 @@ #include 
"flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" @@ -51,7 +52,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include // temporary -- only used in genIeeeGetOrSetModesOrStatus -#include #include #define DEBUG_TYPE "flang-lower-intrinsic" @@ -147,6 +147,10 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, + {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -2574,6 +2578,26 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return builder.create(loc, atan, factor); } +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return builder.create( + loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst); +} + +mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 3d487fd000a094..53b6beaaf1ad8f 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system() end function end interface public :: __fadd_ru - + + ! Atomic Operations + + interface atomicadd + attributes(device) pure integer function atomicaddi(address, val) + !dir$ ignore_tkr (d) address, (d) val + integer, intent(inout) :: address + integer, value :: val + end function + attributes(device) pure real function atomicaddf(address, val) + !dir$ ignore_tkr (d) address, (d) val + real, intent(inout) :: address + real, value :: val + end function + attributes(device) pure real*8 function atomicaddd(address, val) + !dir$ ignore_tkr (d) address, (d) val + real*8, intent(inout) :: address + real*8, value :: val + end function + attributes(device) pure integer(8) function atomicaddl(address, val) + !dir$ ignore_tkr (d) address, (d) val + integer(8), intent(inout) :: address + integer(8), value :: val + end function + end interface +public :: atomicadd + end module diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 2042bbbe19650a..661e5728bf85b8 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -5,6 +5,10 @@ attributes(global) subroutine devsub() implicit none integer :: ret + real(4) :: af + real(8) :: ad + integer(4) :: ai + integer(8) :: al call syncthreads() call syncwarp(1) @@ -14,6 +18,11 @@ attributes(global) subroutine devsub() ret = syncthreads_and(1) ret = syncthreads_count(1) ret = syncthreads_or(1) + + ai = atomicadd(ai, 1_4) + al = atomicadd(al, 1_8) + af = atomicadd(af, 1.0_4) + 
ad = atomicadd(ad, 1.0_8) end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} @@ -25,6 +34,10 @@ end ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath : (i32) -> i32 +! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 +! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 ! CHECK: func.func private @llvm.nvvm.barrier0() ! CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs} diff --git a/flang/test/Semantics/cuf-device-procedures01.cuf b/flang/test/Semantics/cuf-device-procedures01.cuf index b9918d8a4ae4ce..92ee02bb3c64df 100644 --- a/flang/test/Semantics/cuf-device-procedures01.cuf +++ b/flang/test/Semantics/cuf-device-procedures01.cuf @@ -28,8 +28,17 @@ end ! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice subroutine host() + real(4) :: af + real(8) :: ad + integer(4) :: ai + integer(8) :: al call syncthreads() + ai = atomicadd(ai, 1_4) + al = atomicadd(al, 1_8) + af = atomicadd(af, 1.0_4) + ad = atomicadd(ad, 1.0_8) end subroutine ! CHECK-LABEL: Subprogram scope: host +! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}} ! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}} From 892a804d93d44ddfd7cd351852fe6aef32d4dcd0 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Wed, 22 Jan 2025 19:37:11 -0800 Subject: [PATCH 080/208] [NVPTX] Stop using 16-bit CAS instructions from PTX (#120220) Increases minimum CAS size from 16 bit to 32 bit, for better SASS codegen. 
When atomics are emulated using atom.cas.b16, the SASS generated includes 2 (nested) emulation loops. When emulated using an atom.cas.b32 loop, the SASS too has a single emulation loop. Using 32 bit CAS thus results in better codegen. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 + llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 8 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 122 +++++++++++++------- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 102 ++++++++++------ 5 files changed, 153 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ed493d50712a2a..773c97f7b4dc0f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -995,7 +995,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); - setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32); + setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 3b5c28e357e0cc..919f487c701416 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -128,6 +128,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // set of equivalent memory operations with a scalar data-type, executed in // an unspecified order on the elements in the vector. 
unsigned getMaxRequiredAlignment() const { return 8; } + // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS + unsigned getMinCmpXchgSizeInBits() const { return 32; } unsigned getPTXVersion() const { return PTXVersion; } diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index b76b3e59e9e6d0..b180928af82a48 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -134,10 +134,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret; - %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst - %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst - %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst - %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst + %r1 = atomicrmw fadd ptr %dp0, half %val monotonic + %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic + %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic + %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 67552b95e04915..67abfe8295a623 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -46,59 +46,101 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-LABEL: test( ; CHECKPTX71: { ; CHECKPTX71-NEXT: .reg .pred %p<5>; -; CHECKPTX71-NEXT: .reg .b16 %rs<26>; -; CHECKPTX71-NEXT: .reg .b32 %r<4>; +; CHECKPTX71-NEXT: .reg .b16 %rs<14>; +; CHECKPTX71-NEXT: .reg .b32 %r<58>; ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: -; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3]; -; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2]; -; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1]; -; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0]; 
-; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1]; -; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14 +; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3]; +; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2]; +; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1]; +; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0]; +; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4; +; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3; +; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3; +; CHECKPTX71-NEXT: mov.b32 %r26, 65535; +; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; +; CHECKPTX71-NEXT: not.b32 %r3, %r27; +; CHECKPTX71-NEXT: ld.u32 %r54, [%r1]; +; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80; -; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13; -; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15; -; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22; -; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3; +; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28; +; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; +; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4; +; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; +; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; +; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; +; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; +; CHECKPTX71-NEXT: mov.u32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; -; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13 -; CHECKPTX71-NEXT: ld.b16 %rs23, [%r1]; -; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8 +; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 +; CHECKPTX71-NEXT: ld.u32 %r55, [%r1]; +; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80; -; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16; -; CHECKPTX71-NEXT: atom.cas.b16 
%rs6, [%r1], %rs23, %rs17; -; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23; -; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6; +; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33; +; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; +; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7; +; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; +; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; +; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; +; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; +; CHECKPTX71-NEXT: mov.u32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; -; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7 -; CHECKPTX71-NEXT: ld.global.b16 %rs24, [%r2]; -; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2 +; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26 +; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4; +; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3; +; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; +; CHECKPTX71-NEXT: mov.b32 %r39, 65535; +; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; +; CHECKPTX71-NEXT: not.b32 %r12, %r40; +; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10]; +; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80; -; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13; -; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19; -; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24; -; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9; +; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41; +; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; +; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10; +; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; +; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; +; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; +; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: 
setp.ne.s32 %p3, %r15, %r56; +; CHECKPTX71-NEXT: mov.u32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; -; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1 -; CHECKPTX71-NEXT: ld.shared.b16 %rs25, [%r3]; +; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 +; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4; +; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3; +; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24; +; CHECKPTX71-NEXT: mov.b32 %r47, 65535; +; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17; +; CHECKPTX71-NEXT: not.b32 %r18, %r48; +; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80; -; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13; -; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21; -; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25; -; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12; +; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49; +; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; +; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13; +; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; +; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; +; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; +; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; +; CHECKPTX71-NEXT: mov.u32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; - %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst - %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst - %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst - %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst + %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic + %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic + %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic + %r4 = 
atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic ret void } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 608dbb3a0ba732..33a1f15c6a5cd6 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-LABEL: relaxed_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<17>; -; SM70-NEXT: .reg .b32 %r<3>; -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs9, [relaxed_sys_i8_param_2]; +; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; ; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -2; -; SM70-NEXT: ld.param.u8 %rs10, [relaxed_sys_i8_param_1]; -; SM70-NEXT: and.b64 %rd3, %rd2, 1; -; SM70-NEXT: shl.b64 %rd4, %rd3, 3; -; SM70-NEXT: cvt.u32.u64 %r1, %rd4; -; SM70-NEXT: mov.b16 %rs11, 255; -; SM70-NEXT: shl.b16 %rs12, %rs11, %r1; -; SM70-NEXT: not.b16 %rs2, %rs12; -; SM70-NEXT: shl.b16 %rs3, %rs9, %r1; -; SM70-NEXT: shl.b16 %rs4, %rs10, %r1; -; SM70-NEXT: ld.u16 %rs13, [%rd1]; -; SM70-NEXT: and.b16 %rs16, %rs13, %rs2; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b16 %rs14, %rs16, %rs3; -; SM70-NEXT: 
or.b16 %rs15, %rs16, %rs4; -; SM70-NEXT: atom.cas.b16 %rs7, [%rd1], %rs15, %rs14; -; SM70-NEXT: setp.eq.s16 %p1, %rs7, %rs15; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM70-NEXT: and.b16 %rs8, %rs7, %rs2; -; SM70-NEXT: setp.ne.s16 %p2, %rs16, %rs8; -; SM70-NEXT: mov.u16 %rs16, %rs8; +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: cvt.u32.u16 %r2, %rs9; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } @@ -137,19 +138,44 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; ; SM70-LABEL: relaxed_sys_i16( ; SM70: { -; SM70-NEXT: .reg .b16 %rs<4>; -; SM70-NEXT: .reg .b32 %r<2>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i16_param_0]; -; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_1]; -; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2]; -; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2; -; SM70-NEXT: cvt.u32.u16 %r1, %rs2; -; SM70-NEXT: st.param.b32 [func_retval0], %r1; +; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; 
+; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB1_1; +; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic ret i16 %new } @@ -180,7 +206,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic ret i32 %new } @@ -209,7 +235,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic ret i64 %new } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: From 1c5d971e425ff080dffd4d9a9a7734ead042d323 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 22 Jan 2025 19:59:24 -0800 Subject: [PATCH 081/208] [Signals] Exclude dladdr for AIX after #123879 Widely supported but missing on AIX https://www.austingroupbugs.net/view.php?id=993 --- llvm/lib/Support/Unix/Signals.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index b840ec69c08a73..330b5d26fa50be 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -812,7 +812,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { OS << "Stack dump without symbol names (ensure you have llvm-symbolizer in " "your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point " "to it):\n"; -#if HAVE_DLOPEN +#if HAVE_DLOPEN && !defined(_AIX) int width = 0; for (int i = 0; i < depth; ++i) { Dl_info dlinfo; From 75750722737e9128500b81363ba66c62fea1e4fe Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 23 Jan 2025 04:04:05 +0000 Subject: [PATCH 082/208] [llvm-exegesis] Disable exhaustive tests on Windows When looking at the slowest lit tests, I'm seeing these four tests take two to eight minutes. Test coverage on Linux should be sufficient for the functionality on top of it not really being useful on Windows at all. This was observed when hacking on the new premerge in a windows VM. 
--- .../inverse_throughput-prepare-all-snippets-exhaustively.s | 5 +++++ .../inverse_throughput-prepare-all-snippets.s | 5 +++++ .../X86/uops/uops-prepare-all-snippets-exhaustively.s | 5 +++++ .../tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets.s | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets-exhaustively.s b/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets-exhaustively.s index 1e9c5c34115bbe..2e982f47835eca 100644 --- a/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets-exhaustively.s +++ b/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets-exhaustively.s @@ -1,3 +1,8 @@ +# Only run this on Linux. Running on Windows can take an exorbitant amount of +# time (upwards of ten minutes), and the only place where this functionality is +# really useful is Linux. +# REQUIRES: x86_64-linux + # RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mode=inverse_throughput -opcode-index=-1 --max-configs-per-opcode=1048576 --benchmark-phase=prepare-snippet --benchmarks-file=- # FIXME: it would be good to check how many snippets we end up producing, # but the number is unstable, so for now just check that we do not crash. diff --git a/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets.s b/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets.s index 538555ed809c08..084025acd49819 100644 --- a/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets.s +++ b/llvm/test/tools/llvm-exegesis/X86/inverse_throughput/inverse_throughput-prepare-all-snippets.s @@ -1,3 +1,8 @@ +# Only run this on Linux. 
Running on Windows can take an exorbitant amount of +# time (upwards of ten minutes), and the only place where this functionality is +# really useful is Linux. +# REQUIRES: x86_64-linux + # RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mode=inverse_throughput -opcode-index=-1 --max-configs-per-opcode=1 --benchmark-phase=prepare-snippet --benchmarks-file=- # FIXME: it would be good to check how many snippets we end up producing, # but the number is unstable, so for now just check that we do not crash. diff --git a/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets-exhaustively.s b/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets-exhaustively.s index eeecd2e8196f46..bb9af9abbc09a2 100644 --- a/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets-exhaustively.s +++ b/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets-exhaustively.s @@ -1,3 +1,8 @@ +# Only run this on Linux. Running on Windows can take an exorbitant amount of +# time (upwards of ten minutes), and the only place where this functionality is +# really useful is Linux. +# REQUIRES: x86_64-linux + # RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mode=uops -opcode-index=-1 --max-configs-per-opcode=1048576 --benchmark-phase=prepare-snippet --benchmarks-file=- # FIXME: it would be good to check how many snippets we end up producing, # but the number is unstable, so for now just check that we do not crash. diff --git a/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets.s b/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets.s index d0a54e2476b166..d8ce870be1d60d 100644 --- a/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets.s +++ b/llvm/test/tools/llvm-exegesis/X86/uops/uops-prepare-all-snippets.s @@ -1,3 +1,8 @@ +# Only run this on Linux. 
Running on Windows can take an exorbitant amount of +# time (upwards of ten minutes), and the only place where this functionality is +# really useful is Linux. +# REQUIRES: x86_64-linux + # RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mode=uops -opcode-index=-1 --max-configs-per-opcode=1 --benchmark-phase=prepare-snippet --benchmarks-file=- # FIXME: it would be good to check how many snippets we end up producing, # but the number is unstable, so for now just check that we do not crash. From 19834b4623fd1e7ae5185ed76031b407c3fa7a47 Mon Sep 17 00:00:00 2001 From: tangaac Date: Thu, 23 Jan 2025 12:11:07 +0800 Subject: [PATCH 083/208] [LoongArch] Support sc.q instruction for 128bit cmpxchg operation (#116771) Two options for clang -mno-scq: Disable sc.q instruction. -mscq: Enable sc.q instruction. The default is -mno-scq. --- clang/include/clang/Driver/Options.td | 4 + clang/lib/Basic/Targets/LoongArch.cpp | 7 +- clang/lib/Basic/Targets/LoongArch.h | 2 + .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 2 + clang/test/Driver/loongarch-march.c | 8 +- clang/test/Driver/loongarch-mscq.c | 30 ++ clang/test/Preprocessor/init-loongarch.c | 31 +- .../TargetParser/LoongArchTargetParser.def | 3 +- .../llvm/TargetParser/LoongArchTargetParser.h | 3 + llvm/lib/Target/LoongArch/LoongArch.td | 9 +- .../LoongArchExpandAtomicPseudoInsts.cpp | 110 ++++++ .../LoongArch/LoongArchISelLowering.cpp | 46 +++ .../Target/LoongArch/LoongArchInstrInfo.td | 14 + llvm/lib/TargetParser/Host.cpp | 2 +- .../TargetParser/LoongArchTargetParser.cpp | 1 + .../ir-instruction/atomic-cmpxchg-128.ll | 352 ++++++++++++++++++ 16 files changed, 605 insertions(+), 19 deletions(-) create mode 100644 clang/test/Driver/loongarch-mscq.c create mode 100644 llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg-128.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 852051e772fc1c..df705104d9ea31 100644 --- a/clang/include/clang/Driver/Options.td +++ 
b/clang/include/clang/Driver/Options.td @@ -5474,6 +5474,10 @@ def mdiv32 : Flag<["-"], "mdiv32">, Group, HelpText<"Use div.w[u] and mod.w[u] instructions with input not sign-extended.">; def mno_div32 : Flag<["-"], "mno-div32">, Group, HelpText<"Do not use div.w[u] and mod.w[u] instructions with input not sign-extended.">; +def mscq : Flag<["-"], "mscq">, Group, + HelpText<"Enable sc.q instruction.">; +def mno_scq : Flag<["-"], "mno-scq">, Group, + HelpText<"Disable sc.q instruction.">; def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group, HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">; def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group, diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index d36186aa9c2fbf..bb0d0b68cfcb0a 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -206,7 +206,7 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, // arch feature set will be used to include all sub-features belonging to // the V1.1 ISA version. 
if (HasFeatureFrecipe && HasFeatureLAM_BH && HasFeatureLAMCAS && - HasFeatureLD_SEQ_SA && HasFeatureDiv32) + HasFeatureLD_SEQ_SA && HasFeatureDiv32 && HasFeatureSCQ) Builder.defineMacro("__loongarch_arch", Twine('"') + "la64v1.1" + Twine('"')); else @@ -249,6 +249,9 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, if (HasFeatureDiv32) Builder.defineMacro("__loongarch_div32", Twine(1)); + if (HasFeatureSCQ) + Builder.defineMacro("__loongarch_scq", Twine(1)); + StringRef ABI = getABI(); if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s") Builder.defineMacro("__loongarch_lp64"); @@ -333,6 +336,8 @@ bool LoongArchTargetInfo::handleTargetFeatures( HasFeatureLD_SEQ_SA = true; else if (Feature == "+div32") HasFeatureDiv32 = true; + else if (Feature == "+scq") + HasFeatureSCQ = true; } return true; } diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index abaa05aa42d438..5c34c84ff8d3e8 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -34,6 +34,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { bool HasFeatureLAMCAS; bool HasFeatureLD_SEQ_SA; bool HasFeatureDiv32; + bool HasFeatureSCQ; public: LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &) @@ -47,6 +48,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { HasFeatureLAMCAS = false; HasFeatureLD_SEQ_SA = false; HasFeatureDiv32 = false; + HasFeatureSCQ = false; LongDoubleWidth = 128; LongDoubleAlign = 128; LongDoubleFormat = &llvm::APFloat::IEEEquad(); diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index e36272d2083206..0575a1ebef3a6c 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -286,6 +286,8 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, options::OPT_mno_ld_seq_sa, "ld-seq-sa"); AddTargetFeature(Args, 
Features, options::OPT_mdiv32, options::OPT_mno_div32, "div32"); + AddTargetFeature(Args, Features, options::OPT_mscq, options::OPT_mno_scq, + "scq"); } std::string loongarch::postProcessTargetCPUString(const std::string &CPU, diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c index cfcfa852efea58..b52cdb330716ff 100644 --- a/clang/test/Driver/loongarch-march.c +++ b/clang/test/Driver/loongarch-march.c @@ -39,21 +39,21 @@ // CC1-LA64V1P1: "-target-cpu" "loongarch64" // CC1-LA64V1P1-NOT: "-target-feature" -// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+lamcas" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32" +// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+lamcas" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32" "-target-feature" "+scq" // CC1-LA64V1P1-NOT: "-target-feature" // CC1-LA64V1P1: "-target-abi" "lp64d" // CC1-LA664: "-target-cpu" "la664" // CC1-LA664-NOT: "-target-feature" -// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+lamcas" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32" +// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+lamcas" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32" "-target-feature" "+scq" // CC1-LA664-NOT: "-target-feature" // CC1-LA664: "-target-abi" "lp64d" // IR-LOONGARCH64: attributes #[[#]] 
={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual" // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual" // IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual" -// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+div32,+frecipe,+lam-bh,+lamcas,+ld-seq-sa,+lsx,+ual" -// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+div32,+f,+frecipe,+lam-bh,+lamcas,+lasx,+ld-seq-sa,+lsx,+ual" +// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+div32,+frecipe,+lam-bh,+lamcas,+ld-seq-sa,+lsx,+scq,+ual" +// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+div32,+f,+frecipe,+lam-bh,+lamcas,+lasx,+ld-seq-sa,+lsx,+scq,+ual" int foo(void) { return 3; diff --git a/clang/test/Driver/loongarch-mscq.c b/clang/test/Driver/loongarch-mscq.c new file mode 100644 index 00000000000000..cd798ba5d8ff2b --- /dev/null +++ b/clang/test/Driver/loongarch-mscq.c @@ -0,0 +1,30 @@ +/// Test -m[no]scq options. 
+ +// RUN: %clang --target=loongarch64 -mscq -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-SCQ +// RUN: %clang --target=loongarch64 -mno-scq -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-SCQ +// RUN: %clang --target=loongarch64 -mno-scq -mscq -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-SCQ +// RUN: %clang --target=loongarch64 -mscq -mno-scq -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-SCQ + +// RUN: %clang --target=loongarch64 -mscq -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-SCQ +// RUN: %clang --target=loongarch64 -mno-scq -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-SCQ +// RUN: %clang --target=loongarch64 -mno-scq -mscq -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-SCQ +// RUN: %clang --target=loongarch64 -mscq -mno-scq -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-SCQ + + +// CC1-SCQ: "-target-feature" "+scq" +// CC1-NO-SCQ: "-target-feature" "-scq" + +// IR-SCQ: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+scq{{(,.*)?}}" +// IR-NO-SCQ: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-scq{{(,.*)?}}" + +int foo(void) { + return 42; +} \ No newline at end of file diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index 19458a2b14f40c..f6fd603dc39c0b 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -798,7 +798,7 @@ // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float // LA64-FPU0-LP64S: #define __loongarch_soft_float 1 -/// Check __loongarch_arch{_tune/_frecipe/_lam_bh/_lamcas/_ld_seq_sa/_div32}. +/// Check __loongarch_arch{_tune/_frecipe/_lam_bh/_lamcas/_ld_seq_sa/_div32/_scq}. 
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \ // RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s @@ -823,11 +823,11 @@ // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \ // RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=la64v1.1 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=la64v1.1 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=loongarch64 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \ @@ -835,7 +835,7 @@ // RUN: %clang --target=loongarch64 
-x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +lam-bh | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lam-bh | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lam-bh | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +lam-bh | \ @@ -843,7 +843,7 @@ // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +lamcas | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAMCAS -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lamcas | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lamcas | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAMCAS -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +lamcas | \ @@ -851,7 
+851,7 @@ // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +ld-seq-sa | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -ld-seq-sa | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,DIV32,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +ld-seq-sa | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LD-SEQ-SA -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +ld-seq-sa | \ @@ -859,21 +859,29 @@ // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +div32 | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -div32| \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +div32 | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=loongarch64 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx 
-Xclang -target-feature -Xclang +div32 | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s -// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe -Xclang -target-feature -Xclang +lam-bh -Xclang -target-feature -Xclang +lamcas -Xclang -target-feature -Xclang +ld-seq-sa -Xclang -target-feature -Xclang +div32 | \ +// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +scq | \ +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -scq | \ +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +scq | \ +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,SCQ -DARCH=loongarch64 -DTUNE=loongarch64 %s +// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +scq | \ +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,SCQ -DARCH=la64v1.0 -DTUNE=loongarch64 %s +// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe -Xclang -target-feature -Xclang +lam-bh -Xclang -target-feature -Xclang +lamcas -Xclang -target-feature -Xclang +ld-seq-sa -Xclang -target-feature -Xclang +div32 -Xclang -target-feature -Xclang +scq | \ // RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE -DARCH=la64v1.1 -DTUNE=loongarch64 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 
-DARCH=la664 -DTUNE=la664 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=la664 -DTUNE=la664 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \ // RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \ // RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \ -// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32 -DARCH=la664 -DTUNE=loongarch64 %s +// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LAMCAS,LD-SEQ-SA,DIV32,SCQ -DARCH=la664 -DTUNE=loongarch64 %s // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]" // DIV32: #define __loongarch_div32 1 @@ -881,6 +889,7 @@ // LAM-BH: #define __loongarch_lam_bh 1 // LAMCAS: #define __loongarch_lamcas 1 // LD-SEQ-SA: #define __loongarch_ld_seq_sa 1 +// SCQ: #define __loongarch_scq 1 // ARCH-TUNE: #define __loongarch_tune "[[TUNE]]" // RUN: %clang --target=loongarch64 -mlsx -x c -E -dM %s -o - \ diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def index 6731a2c975cd54..1bcf65b37f201e 100644 --- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def @@ -15,6 +15,7 @@ LOONGARCH_FEATURE("+lam-bh", FK_LAM_BH) LOONGARCH_FEATURE("+lamcas", FK_LAMCAS) LOONGARCH_FEATURE("+ld-seq-sa", FK_LD_SEQ_SA) LOONGARCH_FEATURE("+div32", FK_DIV32) +LOONGARCH_FEATURE("+scq", FK_SCQ) #undef LOONGARCH_FEATURE @@ -24,6 +25,6 @@ LOONGARCH_FEATURE("+div32", FK_DIV32) LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL) LOONGARCH_ARCH("la464", AK_LA464, 
FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL) -LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE | FK_LAM_BH | FK_LAMCAS | FK_LD_SEQ_SA | FK_DIV32) +LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE | FK_LAM_BH | FK_LAMCAS | FK_LD_SEQ_SA | FK_DIV32 | FK_SCQ) #undef LOONGARCH_ARCH diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h index 52cd51f43ad640..e08e7bc182e112 100644 --- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h @@ -63,6 +63,9 @@ enum FeatureKind : uint32_t { // Assume div.w[u] and mod.w[u] can handle inputs that are not sign-extended. FK_DIV32 = 1 << 13, + + // sc.q is available. + FK_SCQ = 1 << 14, }; struct FeatureInfo { diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 596c8c90c0a1f6..5fd52babfc6ec3 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -135,6 +135,12 @@ def FeatureDiv32 "Assume div.w[u] and mod.w[u] can handle inputs that are not sign-extended">; def HasDiv32 : Predicate<"Subtarget->hasDiv32()">; +// Support SC.Q instruction +def FeatureSCQ + : SubtargetFeature<"scq", "HasSCQ", "true", + "Support sc.q instruction">; +def HasSCQ : Predicate<"Subtarget->hasSCQ()">; + def TunePreferWInst : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true", "Prefer instructions with W suffix">; @@ -180,7 +186,8 @@ def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit, FeatureLAM_BH, FeatureLAMCAS, FeatureLD_SEQ_SA, - FeatureDiv32]>; + FeatureDiv32, + FeatureSCQ]>; //===----------------------------------------------------------------------===// // Define the LoongArch target. 
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp index 35f84425cb0eba..79f37a0f548c69 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp @@ -58,6 +58,9 @@ class LoongArchExpandAtomicPseudo : public MachineFunctionPass { bool expandAtomicCmpXchg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI); + bool expandAtomicCmpXchg128(MachineBasicBlock &MBB, + MachineBasicBlock::iterator, + MachineBasicBlock::iterator &NextMBBI); }; char LoongArchExpandAtomicPseudo::ID = 0; @@ -131,6 +134,9 @@ bool LoongArchExpandAtomicPseudo::expandMI( return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI); case LoongArch::PseudoCmpXchg64: return expandAtomicCmpXchg(MBB, MBBI, false, 64, NextMBBI); + case LoongArch::PseudoCmpXchg128: + case LoongArch::PseudoCmpXchg128Acquire: + return expandAtomicCmpXchg128(MBB, MBBI, NextMBBI); case LoongArch::PseudoMaskedCmpXchg32: return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI); case LoongArch::PseudoMaskedAtomicLoadMax32: @@ -604,6 +610,110 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( return true; } +bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg128( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto TailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB); + 
MF->insert(++LoopTailMBB->getIterator(), TailMBB); + MF->insert(++TailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopHeadMBB->addSuccessor(TailMBB); + LoopTailMBB->addSuccessor(DoneMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); + TailMBB->addSuccessor(DoneMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); + + Register DestLoReg = MI.getOperand(0).getReg(); + Register DestHiReg = MI.getOperand(1).getReg(); + Register ScratchReg = MI.getOperand(2).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register CmpValLoReg = MI.getOperand(4).getReg(); + Register CmpValHiReg = MI.getOperand(5).getReg(); + Register NewValLoReg = MI.getOperand(6).getReg(); + Register NewValHiReg = MI.getOperand(7).getReg(); + + // .loophead: + // ll.d res_lo, (addr) + // dbar acquire + // ld.d res_hi, (addr), 8 + // bne dest_lo, cmpval_lo, tail + // bne dest_hi, cmpval_hi, tail + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::LL_D), DestLoReg) + .addReg(AddrReg) + .addImm(0); + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::DBAR)).addImm(0b10100); + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::LD_D), DestHiReg) + .addReg(AddrReg) + .addImm(8); + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::BNE)) + .addReg(DestLoReg) + .addReg(CmpValLoReg) + .addMBB(TailMBB); + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::BNE)) + .addReg(DestHiReg) + .addReg(CmpValHiReg) + .addMBB(TailMBB); + // .looptail: + // move scratch, newval_lo + // sc.q scratch, newval_hi, (addr) + // beqz scratch, loophead + // b done + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::OR), ScratchReg) + .addReg(NewValLoReg) + .addReg(LoongArch::R0); + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::SC_Q), ScratchReg) + .addReg(ScratchReg) + .addReg(NewValHiReg) + .addReg(AddrReg); + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::BEQZ)) + 
.addReg(ScratchReg) + .addMBB(LoopHeadMBB); + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::B)).addMBB(DoneMBB); + int hint; + + switch (MI.getOpcode()) { + case LoongArch::PseudoCmpXchg128Acquire: + // acquire acqrel seqcst + hint = 0b10100; + break; + case LoongArch::PseudoCmpXchg128: + hint = 0x700; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + + // .tail: + // dbar 0x700 | acquire + if (!(hint == 0x700 && MF->getSubtarget().hasLD_SEQ_SA())) + BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(hint); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + + LivePhysRegs LiveRegs; + computeAndAddLiveIns(LiveRegs, *LoopHeadMBB); + computeAndAddLiveIns(LiveRegs, *LoopTailMBB); + computeAndAddLiveIns(LiveRegs, *TailMBB); + computeAndAddLiveIns(LiveRegs, *DoneMBB); + + return true; +} + } // end namespace INITIALIZE_PASS(LoongArchExpandAtomicPseudo, "loongarch-expand-atomic-pseudo", diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 9eb607c69a9526..2282dc8955613a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -386,6 +386,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, // cmpxchg sizes down to 8 bits become legal if LAMCAS is available. 
if (Subtarget.hasLAMCAS()) setMinCmpXchgSizeInBits(8); + + if (Subtarget.hasSCQ()) { + setMaxAtomicSizeInBitsSupported(128); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + } } bool LoongArchTargetLowering::isOffsetFoldingLegal( @@ -2906,6 +2911,43 @@ replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, } } +static void replaceCMP_XCHG_128Results(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + MachineMemOperand *MemOp = cast(N)->getMemOperand(); + + unsigned Opcode; + switch (MemOp->getMergedOrdering()) { + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: + Opcode = LoongArch::PseudoCmpXchg128Acquire; + break; + case AtomicOrdering::Monotonic: + case AtomicOrdering::Release: + Opcode = LoongArch::PseudoCmpXchg128; + break; + default: + llvm_unreachable("Unexpected ordering!"); + } + + SDLoc DL(N); + auto CmpVal = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64); + auto NewVal = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64); + SDValue Ops[] = {N->getOperand(1), CmpVal.first, CmpVal.second, + NewVal.first, NewVal.second, N->getOperand(0)}; + + SDNode *CmpSwap = DAG.getMachineNode( + Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i64, MVT::Other), + Ops); + DAG.setNodeMemRefs(cast(CmpSwap), {MemOp}); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, + SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); + Results.push_back(SDValue(CmpSwap, 3)); +} + void LoongArchTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDLoc DL(N); @@ -3218,6 +3260,10 @@ void LoongArchTargetLowering::ReplaceNodeResults( Results.push_back(Result); break; } + case ISD::ATOMIC_CMP_SWAP: { + replaceCMP_XCHG_128Results(N, Results, DAG); + break; + } } } diff --git 
a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 62cb6fa1d88a8b..9b93a9f8247269 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -2035,6 +2035,20 @@ class PseudoCmpXchg def PseudoCmpXchg32 : PseudoCmpXchg; def PseudoCmpXchg64 : PseudoCmpXchg; +class PseudoCmpXchg128Pat + : Pseudo<(outs GPR:$res_lo, GPR:$res_hi, GPR:$scratch), + (ins GPR:$addr, GPR:$cmpval_lo, GPR:$cmpval_hi, + GPR:$newval_lo, GPR:$newval_hi)> { + let Constraints = "@earlyclobber $res_lo,@earlyclobber $res_hi,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; + let hasSideEffects = 0; + let Size = 36; +} + +def PseudoCmpXchg128 : PseudoCmpXchg128Pat; +def PseudoCmpXchg128Acquire : PseudoCmpXchg128Pat; + def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index ba7032025150e5..fa57ae183bb84a 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -2135,12 +2135,12 @@ const StringMap sys::getHostCPUFeatures() { Features["div32"] = cpucfg2 & (1U << 26); // CPUCFG.2.DIV32 Features["lam-bh"] = cpucfg2 & (1U << 27); // CPUCFG.2.LAM_BH Features["lamcas"] = cpucfg2 & (1U << 28); // CPUCFG.2.LAMCAS + Features["scq"] = cpucfg2 & (1U << 30); // CPUCFG.2.SCQ Features["ld-seq-sa"] = cpucfg3 & (1U << 23); // CPUCFG.3.LD_SEQ_SA // TODO: Need to complete. 
// Features["llacq-screl"] = cpucfg2 & (1U << 29); // CPUCFG.2.LLACQ_SCREL - // Features["scq"] = cpucfg2 & (1U << 30); // CPUCFG.2.SCQ return Features; } #elif defined(__linux__) && defined(__riscv) diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp index c8a07c32247cdf..e394c0c15b207c 100644 --- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp +++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp @@ -56,6 +56,7 @@ bool LoongArch::getArchFeatures(StringRef Arch, Features.push_back("+lamcas"); Features.push_back("+ld-seq-sa"); Features.push_back("+div32"); + Features.push_back("+scq"); } return true; } diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg-128.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg-128.ll new file mode 100644 index 00000000000000..b731081386a2cb --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg-128.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 -mattr=+d,-scq,-ld-seq-sa < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch64 -mattr=+d,+scq,-ld-seq-sa --verify-machineinstrs < %s | FileCheck %s --check-prefixes=LA64-SCQ,NO-LD-SEQ-SA +; RUN: llc --mtriple=loongarch64 -mattr=+d,+scq,+ld-seq-sa --verify-machineinstrs < %s | FileCheck %s --check-prefixes=LA64-SCQ,LD-SEQ-SA + +define void @cmpxchg_i128_acquire_acquire(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_acquire_acquire: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: move $a6, $a4 +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: ori $a4, $zero, 2 +; LA64-NEXT: ori $a5, $zero, 2 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a6 +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $ra, 
$sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; LA64-SCQ-LABEL: cmpxchg_i128_acquire_acquire: +; LA64-SCQ: # %bb.0: +; LA64-SCQ-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; LA64-SCQ-NEXT: ll.d $a5, $a0, 0 +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: ld.d $a6, $a0, 8 +; LA64-SCQ-NEXT: bne $a5, $a1, .LBB0_3 +; LA64-SCQ-NEXT: bne $a6, $a2, .LBB0_3 +; LA64-SCQ-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; LA64-SCQ-NEXT: move $a7, $a3 +; LA64-SCQ-NEXT: sc.q $a7, $a4, $a0 +; LA64-SCQ-NEXT: beqz $a7, .LBB0_1 +; LA64-SCQ-NEXT: b .LBB0_4 +; LA64-SCQ-NEXT: .LBB0_3: +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: .LBB0_4: +; LA64-SCQ-NEXT: ret + %res = cmpxchg ptr %ptr, i128 %cmp, i128 %val acquire acquire + ret void +} + +define void @cmpxchg_i128_acquire_monotonic(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_acquire_monotonic: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: move $a5, $a4 +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: ori $a4, $zero, 2 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a5 +; LA64-NEXT: move $a5, $zero +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; LA64-SCQ-LABEL: cmpxchg_i128_acquire_monotonic: +; LA64-SCQ: # %bb.0: +; LA64-SCQ-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; LA64-SCQ-NEXT: ll.d $a5, $a0, 0 +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: ld.d $a6, $a0, 8 +; LA64-SCQ-NEXT: bne $a5, $a1, .LBB1_3 +; LA64-SCQ-NEXT: bne $a6, $a2, .LBB1_3 +; LA64-SCQ-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; LA64-SCQ-NEXT: move $a7, $a3 +; LA64-SCQ-NEXT: sc.q $a7, $a4, $a0 +; LA64-SCQ-NEXT: beqz $a7, .LBB1_1 +; LA64-SCQ-NEXT: b .LBB1_4 +; LA64-SCQ-NEXT: .LBB1_3: +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: .LBB1_4: 
+; LA64-SCQ-NEXT: ret + %res = cmpxchg ptr %ptr, i128 %cmp, i128 %val acquire monotonic + ret void +} + +define i128 @cmpxchg_i128_acquire_acquire_reti128(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_acquire_acquire_reti128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: move $a6, $a4 +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: ori $a4, $zero, 2 +; LA64-NEXT: ori $a5, $zero, 2 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a6 +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $a1, $sp, 8 +; LA64-NEXT: ld.d $a0, $sp, 0 +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; LA64-SCQ-LABEL: cmpxchg_i128_acquire_acquire_reti128: +; LA64-SCQ: # %bb.0: +; LA64-SCQ-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; LA64-SCQ-NEXT: ll.d $a5, $a0, 0 +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: ld.d $a6, $a0, 8 +; LA64-SCQ-NEXT: bne $a5, $a1, .LBB2_3 +; LA64-SCQ-NEXT: bne $a6, $a2, .LBB2_3 +; LA64-SCQ-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; LA64-SCQ-NEXT: move $a7, $a3 +; LA64-SCQ-NEXT: sc.q $a7, $a4, $a0 +; LA64-SCQ-NEXT: beqz $a7, .LBB2_1 +; LA64-SCQ-NEXT: b .LBB2_4 +; LA64-SCQ-NEXT: .LBB2_3: +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: .LBB2_4: +; LA64-SCQ-NEXT: move $a0, $a5 +; LA64-SCQ-NEXT: move $a1, $a6 +; LA64-SCQ-NEXT: ret + %tmp = cmpxchg ptr %ptr, i128 %cmp, i128 %val acquire acquire + %res = extractvalue { i128, i1 } %tmp, 0 + ret i128 %res +} + +define i1 @cmpxchg_i128_acquire_acquire_reti1(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_acquire_acquire_reti1: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: move $a6, $a4 +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 
+; LA64-NEXT: ori $a4, $zero, 2 +; LA64-NEXT: ori $a5, $zero, 2 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a6 +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; LA64-SCQ-LABEL: cmpxchg_i128_acquire_acquire_reti1: +; LA64-SCQ: # %bb.0: +; LA64-SCQ-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; LA64-SCQ-NEXT: ll.d $a5, $a0, 0 +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: ld.d $a6, $a0, 8 +; LA64-SCQ-NEXT: bne $a5, $a1, .LBB3_3 +; LA64-SCQ-NEXT: bne $a6, $a2, .LBB3_3 +; LA64-SCQ-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; LA64-SCQ-NEXT: move $a7, $a3 +; LA64-SCQ-NEXT: sc.q $a7, $a4, $a0 +; LA64-SCQ-NEXT: beqz $a7, .LBB3_1 +; LA64-SCQ-NEXT: b .LBB3_4 +; LA64-SCQ-NEXT: .LBB3_3: +; LA64-SCQ-NEXT: dbar 20 +; LA64-SCQ-NEXT: .LBB3_4: +; LA64-SCQ-NEXT: xor $a0, $a6, $a2 +; LA64-SCQ-NEXT: xor $a1, $a5, $a1 +; LA64-SCQ-NEXT: or $a0, $a1, $a0 +; LA64-SCQ-NEXT: sltui $a0, $a0, 1 +; LA64-SCQ-NEXT: ret + %tmp = cmpxchg ptr %ptr, i128 %cmp, i128 %val acquire acquire + %res = extractvalue { i128, i1 } %tmp, 1 + ret i1 %res +} + + +define void @cmpxchg_i128_monotonic_monotonic(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_monotonic_monotonic: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a4 +; LA64-NEXT: move $a4, $zero +; LA64-NEXT: move $a5, $zero +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; NO-LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic: +; NO-LD-SEQ-SA: # %bb.0: +; NO-LD-SEQ-SA-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; NO-LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; NO-LD-SEQ-SA-NEXT: dbar 
20 +; NO-LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; NO-LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB4_3 +; NO-LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB4_3 +; NO-LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; NO-LD-SEQ-SA-NEXT: move $a7, $a3 +; NO-LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; NO-LD-SEQ-SA-NEXT: beqz $a7, .LBB4_1 +; NO-LD-SEQ-SA-NEXT: b .LBB4_4 +; NO-LD-SEQ-SA-NEXT: .LBB4_3: +; NO-LD-SEQ-SA-NEXT: dbar 1792 +; NO-LD-SEQ-SA-NEXT: .LBB4_4: +; NO-LD-SEQ-SA-NEXT: ret +; +; LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic: +; LD-SEQ-SA: # %bb.0: +; LD-SEQ-SA-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; LD-SEQ-SA-NEXT: dbar 20 +; LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB4_3 +; LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB4_3 +; LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; LD-SEQ-SA-NEXT: move $a7, $a3 +; LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; LD-SEQ-SA-NEXT: beqz $a7, .LBB4_1 +; LD-SEQ-SA-NEXT: b .LBB4_4 +; LD-SEQ-SA-NEXT: .LBB4_3: +; LD-SEQ-SA-NEXT: .LBB4_4: +; LD-SEQ-SA-NEXT: ret + %res = cmpxchg ptr %ptr, i128 %cmp, i128 %val monotonic monotonic + ret void +} + +define i128 @cmpxchg_i128_monotonic_monotonic_reti128(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_monotonic_monotonic_reti128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a4 +; LA64-NEXT: move $a4, $zero +; LA64-NEXT: move $a5, $zero +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $a1, $sp, 8 +; LA64-NEXT: ld.d $a0, $sp, 0 +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; NO-LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic_reti128: +; NO-LD-SEQ-SA: # %bb.0: +; NO-LD-SEQ-SA-NEXT: .LBB5_1: # =>This Inner Loop Header: 
Depth=1 +; NO-LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; NO-LD-SEQ-SA-NEXT: dbar 20 +; NO-LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; NO-LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB5_3 +; NO-LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB5_3 +; NO-LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; NO-LD-SEQ-SA-NEXT: move $a7, $a3 +; NO-LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; NO-LD-SEQ-SA-NEXT: beqz $a7, .LBB5_1 +; NO-LD-SEQ-SA-NEXT: b .LBB5_4 +; NO-LD-SEQ-SA-NEXT: .LBB5_3: +; NO-LD-SEQ-SA-NEXT: dbar 1792 +; NO-LD-SEQ-SA-NEXT: .LBB5_4: +; NO-LD-SEQ-SA-NEXT: move $a0, $a5 +; NO-LD-SEQ-SA-NEXT: move $a1, $a6 +; NO-LD-SEQ-SA-NEXT: ret +; +; LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic_reti128: +; LD-SEQ-SA: # %bb.0: +; LD-SEQ-SA-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; LD-SEQ-SA-NEXT: dbar 20 +; LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB5_3 +; LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB5_3 +; LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; LD-SEQ-SA-NEXT: move $a7, $a3 +; LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; LD-SEQ-SA-NEXT: beqz $a7, .LBB5_1 +; LD-SEQ-SA-NEXT: b .LBB5_4 +; LD-SEQ-SA-NEXT: .LBB5_3: +; LD-SEQ-SA-NEXT: .LBB5_4: +; LD-SEQ-SA-NEXT: move $a0, $a5 +; LD-SEQ-SA-NEXT: move $a1, $a6 +; LD-SEQ-SA-NEXT: ret + %tmp = cmpxchg ptr %ptr, i128 %cmp, i128 %val monotonic monotonic + %res = extractvalue { i128, i1 } %tmp, 0 + ret i128 %res +} + +define i1 @cmpxchg_i128_monotonic_monotonic_reti1(ptr %ptr, i128 %cmp, i128 %val) nounwind { +; LA64-LABEL: cmpxchg_i128_monotonic_monotonic_reti1: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $a2, $sp, 8 +; LA64-NEXT: st.d $a1, $sp, 0 +; LA64-NEXT: addi.d $a1, $sp, 0 +; LA64-NEXT: move $a2, $a3 +; LA64-NEXT: move $a3, $a4 +; LA64-NEXT: move $a4, $zero +; LA64-NEXT: move $a5, $zero +; LA64-NEXT: bl %plt(__atomic_compare_exchange_16) +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; 
LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret +; +; NO-LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic_reti1: +; NO-LD-SEQ-SA: # %bb.0: +; NO-LD-SEQ-SA-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; NO-LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; NO-LD-SEQ-SA-NEXT: dbar 20 +; NO-LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; NO-LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB6_3 +; NO-LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB6_3 +; NO-LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +; NO-LD-SEQ-SA-NEXT: move $a7, $a3 +; NO-LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; NO-LD-SEQ-SA-NEXT: beqz $a7, .LBB6_1 +; NO-LD-SEQ-SA-NEXT: b .LBB6_4 +; NO-LD-SEQ-SA-NEXT: .LBB6_3: +; NO-LD-SEQ-SA-NEXT: dbar 1792 +; NO-LD-SEQ-SA-NEXT: .LBB6_4: +; NO-LD-SEQ-SA-NEXT: xor $a0, $a6, $a2 +; NO-LD-SEQ-SA-NEXT: xor $a1, $a5, $a1 +; NO-LD-SEQ-SA-NEXT: or $a0, $a1, $a0 +; NO-LD-SEQ-SA-NEXT: sltui $a0, $a0, 1 +; NO-LD-SEQ-SA-NEXT: ret +; +; LD-SEQ-SA-LABEL: cmpxchg_i128_monotonic_monotonic_reti1: +; LD-SEQ-SA: # %bb.0: +; LD-SEQ-SA-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; LD-SEQ-SA-NEXT: ll.d $a5, $a0, 0 +; LD-SEQ-SA-NEXT: dbar 20 +; LD-SEQ-SA-NEXT: ld.d $a6, $a0, 8 +; LD-SEQ-SA-NEXT: bne $a5, $a1, .LBB6_3 +; LD-SEQ-SA-NEXT: bne $a6, $a2, .LBB6_3 +; LD-SEQ-SA-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +; LD-SEQ-SA-NEXT: move $a7, $a3 +; LD-SEQ-SA-NEXT: sc.q $a7, $a4, $a0 +; LD-SEQ-SA-NEXT: beqz $a7, .LBB6_1 +; LD-SEQ-SA-NEXT: b .LBB6_4 +; LD-SEQ-SA-NEXT: .LBB6_3: +; LD-SEQ-SA-NEXT: .LBB6_4: +; LD-SEQ-SA-NEXT: xor $a0, $a6, $a2 +; LD-SEQ-SA-NEXT: xor $a1, $a5, $a1 +; LD-SEQ-SA-NEXT: or $a0, $a1, $a0 +; LD-SEQ-SA-NEXT: sltui $a0, $a0, 1 +; LD-SEQ-SA-NEXT: ret + %tmp = cmpxchg ptr %ptr, i128 %cmp, i128 %val monotonic monotonic + %res = extractvalue { i128, i1 } %tmp, 1 + ret i1 %res +} From 0bcf34e422683b900ed504c5e4605038b257f1ee Mon Sep 17 00:00:00 2001 From: MagentaTreehouse <99200384+MagentaTreehouse@users.noreply.github.com> Date: Wed, 22 Jan 2025 23:27:59 -0500 Subject: [PATCH 084/208] [Clang] 
[NFC] Mark `UnresolvedSetImpl`'s move operations as defaulted (#97930) --- clang/include/clang/AST/UnresolvedSet.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/AST/UnresolvedSet.h b/clang/include/clang/AST/UnresolvedSet.h index 1369725ab4e96a..46daf32b7eba26 100644 --- a/clang/include/clang/AST/UnresolvedSet.h +++ b/clang/include/clang/AST/UnresolvedSet.h @@ -71,9 +71,8 @@ class UnresolvedSetImpl { UnresolvedSetImpl(const UnresolvedSetImpl &) = default; UnresolvedSetImpl &operator=(const UnresolvedSetImpl &) = default; - // FIXME: Switch these to "= default" once MSVC supports generating move ops - UnresolvedSetImpl(UnresolvedSetImpl &&) {} - UnresolvedSetImpl &operator=(UnresolvedSetImpl &&) { return *this; } + UnresolvedSetImpl(UnresolvedSetImpl &&) = default; + UnresolvedSetImpl &operator=(UnresolvedSetImpl &&) = default; public: // We don't currently support assignment through this iterator, so we might From d80b814c010580b0fd02c1b1a9521a0b640a358a Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Thu, 23 Jan 2025 10:13:27 +0800 Subject: [PATCH 085/208] [LoongArch] Summary llvm20 release notes --- llvm/docs/ReleaseNotes.md | 61 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 50cdaafcb3bb77..5f9f2f7f9c329a 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -177,6 +177,57 @@ Changes to the Hexagon Backend Changes to the LoongArch Backend -------------------------------- +* [Incorrect GOT usage](https://github.com/llvm/llvm-project/pull/117099) for `non-dso_local` function calls in large code model is fixed. + +* A [gprof support issue](https://github.com/llvm/llvm-project/issues/121103) is fixed. + +* A [SDAG hang issue](https://github.com/llvm/llvm-project/issues/107355) caused by `ISD::CONCAT_VECTORS` is fixed. 
+ +* A [compiler crash issue](https://github.com/llvm/llvm-project/issues/118301) when converting `half` to `i32` is fixed. + +* Almost all of `la64v1.1` instructions can now be generated. The full list is + `frecipe.s`, `frecipe.d`, `frsqrte.s`, `frsqrte.d`, `vfrecipe.s`, `vfrecipe.d`, + `vfrsqrte.s`, `vfrsqrte.d`, `xvfrecipe.s`, `xvfrecipe.d`, `xvfrsqrte.s`, + `xvfrsqrte.d`, `sc.q`, `amcas.b`, `amcas.h`, `amcas.w`, `amcas.d`, `amcas_db.b`, + `amcas_db.h`, `amcas_db.w`, `amcas_db.d`, `amswap.b`, `amswap.h`, `amswap_db.b`, + `amswap_db.h`, `amadd.b`, `amadd.h`, `amadd_db.b`, `amadd_db.h`. Optionally + generate instructions `dbar 0x700`, `div.w`, `div.wu`, `mod.w` and `mod.wu` + when related target features are enabled. `llacq.w`, `screl.w`, `llacq.d` and + `screl.d` cannot be generated yet. + +* An llc option called `-loongarch-annotate-tablejump` is added to annotate + table jump instruction in the `.discard.tablejump_annotate` section. A typical + user of these annotations is the `objtool` in Linux kernel. + +* The default cpu in `MCSubtargetInfo` is changed from `la464` to `generic-la64`. + In addition, the `lsx` feature is added to `generic-la64`. + +* CFI instructions now allow register names and aliases, previously only numbers + were allowed. + +* `RuntimeDyld` now supports LoongArch, which means that programs relying on + `MCJIT` can now work. + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. + +* `%ld_pcrel_20`, `%gd_pcrel_20` and `%desc_pcrel_20` operand modifiers are + supported by assembler. + +* A machine function pass called `LoongArch Merge Base Offset` is added to merge + the offset of address calculation into the offset field of instructions in a + global address lowering sequence. 
+ +* The `LoopDataPrefetch` pass can now work on LoongArch, but it is disabled by + default due to the bad effect on Fortran benchmarks. + +* Enable alias analysis by default. + +* Avoid indirect branch jumps using the `$ra` register. + +* Other optimizations. + Changes to the MIPS Backend --------------------------- @@ -489,11 +540,19 @@ Changes to LLDB to be opened in the firewall (one for the `lldb-server` platform, one for gdbserver connections). In addition, due to this work, `lldb-server` now works on Windows in the server mode. -* LLDB now supports execution of user expressions for non-trivial cases for RISC-V targets, like function calls, when some code needs to be executed on the target. +* LLDB now supports execution of user expressions for non-trivial cases for LoongArch and RISC-V targets, like function calls, when some code needs to be executed on the target. * LLDB now supports optionally enabled/disabled register sets (particularly floating point registers) for RISC-V 64. This happens for targets like `RV64IMAC` or `RV64IMACV`, that have no floating point registers. The change is applied to native debugging and core-file usage. +* LLDB now supports [core-file for LoongArch](https://github.com/llvm/llvm-project/pull/112296). + +* LLDB now supports [hardware breakpoint and watchpoint for LoongArch](https://github.com/llvm/llvm-project/pull/118770). + +* LLDB now supports [vector registers for LoongArch](https://github.com/llvm/llvm-project/pull/120664) when debugging a live process. + +* Incorrect floating-point register dwarf number for LoongArch is [fixed](https://github.com/llvm/llvm-project/pull/120391). 
+ Changes to BOLT --------------------------------- From 3c7a878d919c6483c9e78a3ed4578d4ee2f54408 Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Thu, 23 Jan 2025 10:13:38 +0800 Subject: [PATCH 086/208] [LoongArch] Summary clang20 release notes --- clang/docs/ReleaseNotes.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index abc9ce60e7d01f..75931bb25f06d9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1135,6 +1135,20 @@ Windows Support LoongArch Support ^^^^^^^^^^^^^^^^^ +- Types of parameters and return value of ``__builtin_lsx_vorn_v`` and ``__builtin_lasx_xvorn_v`` + are changed from ``signed char`` to ``unsigned char``. (#GH114514) + +- ``-mrelax`` and ``-mno-relax`` are supported now on LoongArch that can be used + to enable / disable the linker relaxation optimization. (#GH123587) + +- Fine-grained la64v1.1 options are added including ``-m{no-,}frecipe``, ``-m{no-,}lam-bh``, + ``-m{no-,}ld-seq-sa``, ``-m{no-,}div32``, ``-m{no-,}lamcas`` and ``-m{no-,}scq``. + +- Two options ``-m{no-,}annotate-tablejump`` are added to enable / disable + annotating table jump instruction to correlate it with the jump table. (#GH102411) + +- FreeBSD support is added for LoongArch64 and has been tested by building kernel-toolchain. 
(#GH119191) + RISC-V Support ^^^^^^^^^^^^^^ From aa273fd83eccb55215f4cb18285f8462a1013f5c Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Thu, 23 Jan 2025 10:11:38 +0800 Subject: [PATCH 087/208] [LoongArch] Update lld20 release notes --- lld/docs/ReleaseNotes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 0c24156ae16799..1995c80df652cb 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -40,6 +40,8 @@ ELF Improvements * ``R_X86_64_CODE_4_GOTTPOFF`` (`#116634 `_) * ``R_X86_64_CODE_4_GOTPC32_TLSDESC`` (`#116909 `_) * ``R_X86_64_CODE_6_GOTTPOFF`` (`#117675 `_) +* Supported relocation types for LoongArch target: ``R_LARCH_TLS_{LD,GD,DESC}_PCREL20_S2``. + (`#100105 `_) Breaking changes ---------------- From 163935a48df69bde944fae2b4581541dab30c730 Mon Sep 17 00:00:00 2001 From: quic_hchandel <165007698+hchandel@users.noreply.github.com> Date: Thu, 23 Jan 2025 10:14:25 +0530 Subject: [PATCH 088/208] [RISCV] Add Qualcomm uC Xqcilo (Large Offset Load Store) extension (#123881) This extension adds eight 48 bit load store instructions. The current spec can be found at: https://github.com/quic/riscv-unified-db/releases/latest This patch adds assembler only support. 
--------- Co-authored-by: Harsh Chandel --- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 13 ++ .../RISCV/Disassembler/RISCVDisassembler.cpp | 28 ++++- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 ++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 80 ++++++++++++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 4 +- llvm/test/CodeGen/RISCV/attributes.ll | 2 + llvm/test/MC/RISCV/xqcilo-aliases-valid.s | 50 ++++++++ llvm/test/MC/RISCV/xqcilo-invalid.s | 108 +++++++++++++++++ llvm/test/MC/RISCV/xqcilo-valid.s | 114 ++++++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 4 +- 15 files changed, 414 insertions(+), 5 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqcilo-aliases-valid.s create mode 100644 llvm/test/MC/RISCV/xqcilo-invalid.s create mode 100644 llvm/test/MC/RISCV/xqcilo-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index b28e0a07dad241..ae3a1c29df3976 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -197,6 +197,7 @@ // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) // CHECK-NEXT: xqciint 0.2 'Xqciint' (Qualcomm uC Interrupts Extension) +// CHECK-NEXT: xqcilo 0.2 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-EMPTY: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index a1df0f7d686e62..c83fd1db0ba9b5 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -450,6 +450,9 @@ The current vendor 
extensions supported are: ``experimental-Xqciint`` LLVM implements `version 0.2 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqcilo`` + LLVM implements `version 0.2 of the Qualcomm uC Large Offset Load Store extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcilsm`` LLVM implements `version 0.2 of the Qualcomm uC Load Store Multiple extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 5f9f2f7f9c329a..eb6e9c9b75beb5 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -294,6 +294,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqciint` (Interrupts) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqcilo` (Large Offset Load Store) + extension. * Added ``Sdext`` and ``Sdtrig`` extensions. 
Changes to the WebAssembly Backend diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 8177280044bf44..227a6361730da6 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1036,6 +1036,16 @@ struct RISCVOperand final : public MCParsedAsmOperand { VK == RISCVMCExpr::VK_RISCV_None; } + bool isSImm26() const { + if (!isImm()) + return false; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + int64_t Imm; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && (VK == RISCVMCExpr::VK_RISCV_None) && + isInt<26>(fixImmediateForRV32(Imm, isRV64Imm())); + } + /// getStartLoc - Gets location of the first token of this operand SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Gets location of the last token of this operand @@ -1676,6 +1686,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, (1 << 4), "immediate must be in the range"); } + case Match_InvalidSImm26: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 25), + (1 << 25) - 1); case Match_InvalidRlist: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error( diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 971ef90c63327d..a0b87f7c7ff257 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -45,6 +45,10 @@ class RISCVDisassembler : public MCDisassembler { private: void addSPOperands(MCInst &MI) const; + DecodeStatus getInstruction48(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &CStream) const; + DecodeStatus getInstruction32(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const; @@ -745,6 +749,27 @@ 
DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } +DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CS) const { + if (Bytes.size() < 6) { + Size = 0; + return MCDisassembler::Fail; + } + Size = 6; + + uint64_t Insn = 0; + for (size_t i = Size; i-- != 0;) { + Insn += (static_cast(Bytes[i]) << 8 * i); + } + TRY_TO_DECODE_FEATURE( + RISCV::FeatureVendorXqcilo, DecoderTableXqcilo48, + "Qualcomm uC Large Offset Load Store custom 48bit opcode table"); + + return MCDisassembler::Fail; +} + DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t Address, @@ -760,8 +785,7 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // 48-bit instructions are encoded as 0bxx011111. if ((Bytes[0] & 0b11'1111) == 0b01'1111) { - Size = Bytes.size() >= 6 ? 6 : 0; - return MCDisassembler::Fail; + return getInstruction48(MI, Size, Bytes, Address, CS); } // 64-bit instructions are encoded as 0x0111111. 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index ab04b09a7ad151..e9abc90d69a131 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -332,6 +332,7 @@ enum OperandType : unsigned { OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM12, OPERAND_SIMM12_LSB00000, + OPERAND_SIMM26, OPERAND_CLUI_IMM, OPERAND_VTYPEI10, OPERAND_VTYPEI11, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index f721d7148526ba..4119dd77804f1a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1310,6 +1310,14 @@ def HasVendorXqciint AssemblerPredicate<(all_of FeatureVendorXqciint), "'Xqciint' (Qualcomm uC Interrupts Extension)">; +def FeatureVendorXqcilo + : RISCVExperimentalExtension<0, 2, "Qualcomm uC Large Offset Load Store Extension", + [FeatureStdExtZca]>; +def HasVendorXqcilo + : Predicate<"Subtarget->hasVendorXqcilo()">, + AssemblerPredicate<(all_of FeatureVendorXqcilo), + "'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index e6678a795c807f..bd02880b0d7129 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2513,6 +2513,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) CASE_OPERAND_SIMM(12) + CASE_OPERAND_SIMM(26) // clang-format on case RISCVOp::OPERAND_SIMM5_PLUS1: Ok = (isInt<5>(Imm) && Imm != -16) || Imm == 16; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 
ce8c0c0a3d4e5d..f746cce8c9a0f1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -32,6 +32,8 @@ def uimm10 : RISCVUImmLeafOp<10>; def uimm11 : RISCVUImmLeafOp<11>; +def simm26 : RISCVSImmLeafOp<26>; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -198,6 +200,51 @@ class QCIInt_IMM funct1, string opcodestr> let Inst{24-20} = imm10{9-5}; } +class QCIRVInstEIBase funct3, bits<2> funct2, dag outs, + dag ins, string opcodestr, string argstr> + : RVInst48 { + bits<5> rd; + bits<5> rs1; + bits<26> imm; + + let Inst{47-32} = imm{25-10}; + let Inst{31-30} = funct2; + let Inst{29-20} = imm{9-0}; + let Inst{19-15} = rs1; + let Inst{14-12} = funct3; + let Inst{11-7} = rd; + let Inst{6-0} = 0b0011111; +} + +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in +class QCIRVInstEILoad funct3, bits<2> funct2, string opcodestr> + : QCIRVInstEIBase; + +class QCIRVInstESBase funct3, bits<2> funct2, dag outs, + dag ins, string opcodestr, string argstr> + : RVInst48 { + bits<5> rs1; + bits<5> rs2; + bits<26> imm; + + let Inst{47-32} = imm{25-10}; + let Inst{31-30} = funct2; + let Inst{29-25} = imm{9-5}; + let Inst{24-20} = rs2; + let Inst{19-15} = rs1; + let Inst{14-12} = funct3; + let Inst{11-7} = imm{4-0}; + let Inst{6-0} = 0b0011111; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in +class QCIRVInstESStore funct3, bits<2> funct2, string opcodestr> + : QCIRVInstESBase; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -376,6 +423,18 @@ let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in { def QC_C_MILEAVERET : QCIRVInst16CI_NONE<0b10100, "qc.c.mileaveret">; } // Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = 
"Xqciint" +let Predicates = [HasVendorXqcilo, IsRV32], DecoderNamespace = "Xqcilo" in { + def QC_E_LB : QCIRVInstEILoad<0b101, 0b00, "qc.e.lb">; + def QC_E_LBU : QCIRVInstEILoad<0b101, 0b01, "qc.e.lbu">; + def QC_E_LH : QCIRVInstEILoad<0b101, 0b10, "qc.e.lh">; + def QC_E_LHU : QCIRVInstEILoad<0b101, 0b11, "qc.e.lhu">; + def QC_E_LW : QCIRVInstEILoad<0b110, 0b00, "qc.e.lw">; + + def QC_E_SB : QCIRVInstESStore<0b110, 0b01, "qc.e.sb">; + def QC_E_SH : QCIRVInstESStore<0b110, 0b10, "qc.e.sh">; + def QC_E_SW : QCIRVInstESStore<0b110, 0b11, "qc.e.sw">; +} // Predicates = [HasVendorXqcilo, IsRV32], DecoderNamespace = "Xqcilo" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// @@ -396,3 +455,24 @@ let EmitPriority = 0 in { (QC_LWMI GPRNoX0:$rd, GPR:$rs1, uimm5nonzero:$length, 0)>; } // EmitPriority = 0 } // Predicates = [HasVendorXqcilsm, IsRV32] + +let Predicates = [HasVendorXqcilo, IsRV32] in { +let EmitPriority = 0 in { + def : InstAlias<"qc.e.lb $rd, (${rs1})", + (QC_E_LB GPR:$rd, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.lbu $rd, (${rs1})", + (QC_E_LBU GPR:$rd, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.lh $rd, (${rs1})", + (QC_E_LH GPR:$rd, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.lhu $rd, (${rs1})", + (QC_E_LHU GPR:$rd, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.lw $rd, (${rs1})", + (QC_E_LW GPR:$rd, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.sb $rs2, (${rs1})", + (QC_E_SB GPR:$rs2, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.sh $rs2, (${rs1})", + (QC_E_SH GPR:$rs2, GPR:$rs1, 0)>; + def : InstAlias<"qc.e.sw $rs2, (${rs1})", + (QC_E_SW GPR:$rs2, GPR:$rs1, 0)>; +} // EmitPriority = 0 +} // Predicates = [HasVendorXqcilo, IsRV32] diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 1995931abfe418..c78d60fd86b3fc 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,8 +742,8 
@@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, {"xqcics"}, - {"xqcicsr"}, {"xqciint"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, {"xqcics"}, + {"xqcicsr"}, {"xqciint"}, {"xqcilo"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index a09261609d8441..caed0bdfb04984 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -88,6 +88,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciint %s -o - | FileCheck --check-prefix=RV32XQCIINT %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilo %s -o - | FileCheck --check-prefix=RV32XQCILO %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -403,6 +404,7 @@ ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" ; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p2" +; RV32XQCILO: .attribute 5, "rv32i2p1_zca1p0_xqcilo0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" diff --git a/llvm/test/MC/RISCV/xqcilo-aliases-valid.s b/llvm/test/MC/RISCV/xqcilo-aliases-valid.s new file mode 100644 index 00000000000000..dddd76260b248e --- /dev/null +++ 
b/llvm/test/MC/RISCV/xqcilo-aliases-valid.s @@ -0,0 +1,50 @@ +# Xqcilo - Qualcomm uC Large Offset Load Store extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilo -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilo < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilo -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilo -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilo < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilo --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.e.lb a1, 0(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x00,0x00,0x00] +qc.e.lb x11, (x10) + + +# CHECK-INST: qc.e.lbu a1, 0(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x40,0x00,0x00] +qc.e.lbu x11, (x10) + + +# CHECK-INST: qc.e.lh a1, 0(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x80,0x00,0x00] +qc.e.lh x11, (x10) + + +# CHECK-INST: qc.e.lhu a1, 0(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0xc0,0x00,0x00] +qc.e.lhu x11, (x10) + + +# CHECK-INST: qc.e.lw a1, 0(a0) +# CHECK-ENC: encoding: [0x9f,0x65,0x05,0x00,0x00,0x00] +qc.e.lw x11, (x10) + + +# CHECK-INST: qc.e.sb a1, 0(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0x40,0x00,0x00] +qc.e.sb x11, (x10) + + +# CHECK-INST: qc.e.sh a1, 0(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0x80,0x00,0x00] +qc.e.sh x11, (x10) + + +# CHECK-INST: qc.e.sw a1, 0(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0xc0,0x00,0x00] +qc.e.sw x11, (x10) diff --git a/llvm/test/MC/RISCV/xqcilo-invalid.s b/llvm/test/MC/RISCV/xqcilo-invalid.s new file mode 100644 index 00000000000000..c298f94ece7596 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcilo-invalid.s @@ -0,0 +1,108 @@ +# Xqcilo - Qualcomm uC 
Large Offset Load Store extension # RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcilo < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-IMM %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcilo < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-EXT %s + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.lb 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.lb x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.lb x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.lb x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.e.lbu 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.lbu x11 + +# CHECK-IMM: :[[@LINE+1]]:15: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.lbu x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.lbu x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.lh 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.lh x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.lh x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.lh x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.e.lhu 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.lhu x11 + +# CHECK-IMM: :[[@LINE+1]]:15: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.lhu x11, 33445562212(x10)
+ +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.lhu x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.lw 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.lw x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.lw x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.lw x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.sb 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.sb x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.sb x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.sb x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.sh 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.sh x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.sh x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.sh x11, 12(x10) + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.e.sw 11, 12(x10) + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.e.sw x11 + +# CHECK-IMM: :[[@LINE+1]]:14: error: immediate must be an integer in the range [-33554432, 33554431] +qc.e.sw x11, 33445562212(x10) + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) +qc.e.sw x11, 12(x10) diff 
--git a/llvm/test/MC/RISCV/xqcilo-valid.s b/llvm/test/MC/RISCV/xqcilo-valid.s new file mode 100644 index 00000000000000..ce486e39313ab8 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcilo-valid.s @@ -0,0 +1,114 @@ +# Xqcilo - Qualcomm uC Large Offset Load Store extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilo -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilo < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilo -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilo -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilo < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilo --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.e.lb a1, 12(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xc5,0x00,0x00,0x00] +qc.e.lb x11, 12(x10) + +# CHECK-INST: qc.e.lb a1, -33554432(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x00,0x00,0x80] +qc.e.lb x11, -33554432(x10) + +# CHECK-INST: qc.e.lb a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xf5,0x3f,0xff,0x7f] +qc.e.lb x11, 33554431(x10) + + +# CHECK-INST: qc.e.lbu a1, 12(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xc5,0x40,0x00,0x00] +qc.e.lbu x11, 12(x10) + +# CHECK-INST: qc.e.lbu a1, -33554432(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x40,0x00,0x80] +qc.e.lbu x11, -33554432(x10) + +# CHECK-INST: qc.e.lbu a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xf5,0x7f,0xff,0x7f] +qc.e.lbu x11, 33554431(x10) + + +# CHECK-INST: qc.e.lh a1, 12(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xc5,0x80,0x00,0x00] +qc.e.lh x11, 12(x10) + +# CHECK-INST: qc.e.lh a1, -33554432(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0x80,0x00,0x80] +qc.e.lh x11, -33554432(x10) + +# CHECK-INST: 
qc.e.lh a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xf5,0xbf,0xff,0x7f] +qc.e.lh x11, 33554431(x10) + + +# CHECK-INST: qc.e.lhu a1, 12(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xc5,0xc0,0x00,0x00] +qc.e.lhu x11, 12(x10) + +# CHECK-INST: qc.e.lhu a1, -33554432(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0x05,0xc0,0x00,0x80] +qc.e.lhu x11, -33554432(x10) + +# CHECK-INST: qc.e.lhu a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x55,0xf5,0xff,0xff,0x7f] +qc.e.lhu x11, 33554431(x10) + + +# CHECK-INST: qc.e.lw a1, 12(a0) +# CHECK-ENC: encoding: [0x9f,0x65,0xc5,0x00,0x00,0x00] +qc.e.lw x11, 12(x10) + +# CHECK-INST: qc.e.lw a1, -33554432(a0) +# CHECK-ENC: encoding: [0x9f,0x65,0x05,0x00,0x00,0x80] +qc.e.lw x11, -33554432(x10) + +# CHECK-INST: qc.e.lw a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x65,0xf5,0x3f,0xff,0x7f] +qc.e.lw x11, 33554431(x10) + + +# CHECK-INST: qc.e.sb a1, 12(a0) +# CHECK-ENC: encoding: [0x1f,0x66,0xb5,0x40,0x00,0x00] +qc.e.sb x11, 12(x10) + +# CHECK-INST: qc.e.sb a1, -33554432(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0x40,0x00,0x80] +qc.e.sb x11, -33554432(x10) + +# CHECK-INST: qc.e.sb a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x6f,0xb5,0x7e,0xff,0x7f] +qc.e.sb x11, 33554431(x10) + + +# CHECK-INST: qc.e.sh a1, 12(a0) +# CHECK-ENC: encoding: [0x1f,0x66,0xb5,0x80,0x00,0x00] +qc.e.sh x11, 12(x10) + +# CHECK-INST: qc.e.sh a1, -33554432(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0x80,0x00,0x80] +qc.e.sh x11, -33554432(x10) + +# CHECK-INST: qc.e.sh a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x6f,0xb5,0xbe,0xff,0x7f] +qc.e.sh x11, 33554431(x10) + + +# CHECK-INST: qc.e.sw a1, 12(a0) +# CHECK-ENC: encoding: [0x1f,0x66,0xb5,0xc0,0x00,0x00] +qc.e.sw x11, 12(x10) + +# CHECK-INST: qc.e.sw a1, -33554432(a0) +# CHECK-ENC: encoding: [0x1f,0x60,0xb5,0xc0,0x00,0x80] +qc.e.sw x11, -33554432(x10) + +# CHECK-INST: qc.e.sw a1, 33554431(a0) +# CHECK-ENC: encoding: [0x9f,0x6f,0xb5,0xfe,0xff,0x7f] +qc.e.sw x11, 33554431(x10) diff --git 
a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 3a7ea4550d4173..14a60c1857f24f 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -656,7 +656,8 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2", "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2", - "rv64i_xqcics0p2", "rv64i_xqcicli0p2", "rv64i_xqciint0p2"}) { + "rv64i_xqcics0p2", "rv64i_xqcicli0p2", "rv64i_xqciint0p2", + "rv64i_xqcilo0p2"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith(" is only supported for 'rv32'")); @@ -1122,6 +1123,7 @@ Experimental extensions xqcics 0.2 xqcicsr 0.2 xqciint 0.2 + xqcilo 0.2 xqcilsm 0.2 xqcisls 0.2 From de209fa11b5455155228bcdba012b6074388b917 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 22 Jan 2025 21:06:46 -0800 Subject: [PATCH 089/208] [CodeGen] Introduce Static Data Splitter pass (#122183) https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744 proposes to partition static data sections. This patch introduces a codegen pass. This patch produces jump table hotness in the in-memory states (machine jump table info and entries). Target-lowering and asm-printer consume the states and produce `.hot` section suffix. The follow up PR https://github.com/llvm/llvm-project/pull/122215 implements such changes. 
--------- Co-authored-by: Ellis Hoag --- llvm/include/llvm/CodeGen/MachineFunction.h | 9 + .../llvm/CodeGen/MachineJumpTableInfo.h | 13 +- llvm/include/llvm/CodeGen/Passes.h | 4 + llvm/include/llvm/InitializePasses.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/CodeGen.cpp | 1 + llvm/lib/CodeGen/MachineFunction.cpp | 15 ++ llvm/lib/CodeGen/StaticDataSplitter.cpp | 181 ++++++++++++++++++ llvm/lib/CodeGen/TargetPassConfig.cpp | 7 + llvm/test/CodeGen/X86/jump-table-partition.ll | 177 +++++++++++++++++ 11 files changed, 408 insertions(+), 2 deletions(-) create mode 100644 llvm/lib/CodeGen/StaticDataSplitter.cpp create mode 100644 llvm/test/CodeGen/X86/jump-table-partition.ll diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index d517b5e6647291..c3eb27b9462879 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -88,6 +88,15 @@ template <> struct ilist_callback_traits { } }; +// The hotness of static data tracked by a MachineFunction and not represented +// as a global object in the module IR / MIR. Typical examples are +// MachineJumpTableInfo and MachineConstantPool. +enum class MachineFunctionDataHotness { + Unknown, + Cold, + Hot, +}; + /// MachineFunctionInfo - This class can be derived from and used by targets to /// hold private target-specific information for each MachineFunction. 
Objects /// of type are accessed/created with MF::getInfo and destroyed when the diff --git a/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h b/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h index e8e9c2f6338e06..56ecbe22ff6dda 100644 --- a/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h +++ b/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h @@ -28,6 +28,7 @@ namespace llvm { class MachineBasicBlock; class DataLayout; class raw_ostream; +enum class MachineFunctionDataHotness; /// MachineJumpTableEntry - One jump table in the jump table info. /// @@ -35,8 +36,11 @@ struct MachineJumpTableEntry { /// MBBs - The vector of basic blocks from which to create the jump table. std::vector MBBs; - explicit MachineJumpTableEntry(const std::vector &M) - : MBBs(M) {} + /// The hotness of MJTE is inferred from the hotness of the source basic + /// block(s) that reference it. + MachineFunctionDataHotness Hotness; + + explicit MachineJumpTableEntry(const std::vector &M); }; class MachineJumpTableInfo { @@ -107,6 +111,11 @@ class MachineJumpTableInfo { return JumpTables; } + // Update machine jump table entry's hotness. Return true if the hotness is + // updated. + bool updateJumpTableEntryHotness(size_t JTI, + MachineFunctionDataHotness Hotness); + /// RemoveJumpTable - Mark the specific index as being dead. This will /// prevent it from being emitted. void RemoveJumpTable(unsigned Idx) { diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d1fac4a304cffe..b5d2a7e6bf035b 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -71,6 +71,10 @@ namespace llvm { /// using profile information. MachineFunctionPass *createMachineFunctionSplitterPass(); + /// createStaticDataSplitterPass - This pass partitions a static data section + /// into a hot and cold section using profile information. 
+ MachineFunctionPass *createStaticDataSplitterPass(); + /// MachineFunctionPrinter pass - This pass prints out the machine function to /// the given stream as a debugging tool. MachineFunctionPass * diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 1cb9013bc48cc5..8111afcc1fb20f 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -293,6 +293,7 @@ void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &); void initializeSpillPlacementWrapperLegacyPass(PassRegistry &); void initializeStackColoringLegacyPass(PassRegistry &); void initializeStackFrameLayoutAnalysisPassPass(PassRegistry &); +void initializeStaticDataSplitterPass(PassRegistry &); void initializeStackMapLivenessPass(PassRegistry &); void initializeStackProtectorPass(PassRegistry &); void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 8a43197d2d45ea..dfe3514360c3c5 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -235,6 +235,7 @@ DUMMY_MACHINE_FUNCTION_PASS("livedebugvalues", LiveDebugValuesPass) DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass) DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass) +DUMMY_MACHINE_FUNCTION_PASS("static-data-splitter", StaticDataSplitter) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 145fd2fac8b564..88f863d8204d09 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ 
-226,6 +226,7 @@ add_llvm_component_library(LLVMCodeGen StackMaps.cpp StackProtector.cpp StackSlotColoring.cpp + StaticDataSplitter.cpp SwiftErrorValueTracking.cpp SwitchLoweringUtils.cpp TailDuplication.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 925d9af7d0e06d..ed871519e33bc2 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -131,6 +131,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeStackMapLivenessPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); + initializeStaticDataSplitterPass(Registry); initializeStripDebugMachineModulePass(Registry); initializeTailDuplicateLegacyPass(Registry); initializeTargetPassConfigPass(Registry); diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index b8dbe834a4d511..e4824183e8dfcf 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -1311,6 +1311,10 @@ const unsigned MachineFunction::DebugOperandMemNumber = 1000000; // MachineJumpTableInfo implementation //===----------------------------------------------------------------------===// +MachineJumpTableEntry::MachineJumpTableEntry( + const std::vector &MBBs) + : MBBs(MBBs), Hotness(MachineFunctionDataHotness::Unknown) {} + /// Return the size of each entry in the jump table. unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const { // The size of a jump table entry is 4 bytes unless the entry is just the @@ -1360,6 +1364,17 @@ unsigned MachineJumpTableInfo::createJumpTableIndex( return JumpTables.size()-1; } +bool MachineJumpTableInfo::updateJumpTableEntryHotness( + size_t JTI, MachineFunctionDataHotness Hotness) { + assert(JTI < JumpTables.size() && "Invalid JTI!"); + // Record the largest hotness value. 
+ if (Hotness <= JumpTables[JTI].Hotness) + return false; + + JumpTables[JTI].Hotness = Hotness; + return true; +} + /// If Old is the target of any jump tables, update the jump tables to branch /// to New instead. bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old, diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp new file mode 100644 index 00000000000000..25f02fde8a4b8a --- /dev/null +++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp @@ -0,0 +1,181 @@ +//===- StaticDataSplitter.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pass uses branch profile data to assign hotness based section qualifiers +// for the following types of static data: +// - Jump tables +// - Constant pools (TODO) +// - Other module-internal data (TODO) +// +// For the original RFC of this pass please see +// https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744 + +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/MBFIWrapper.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "static-data-splitter" + +STATISTIC(NumHotJumpTables, "Number of hot jump 
tables seen"); +STATISTIC(NumColdJumpTables, "Number of cold jump tables seen"); +STATISTIC(NumUnknownJumpTables, + "Number of jump tables with unknown hotness. Option " + "-static-data-default-hotness specifies the hotness."); + +static cl::opt StaticDataDefaultHotness( + "static-data-default-hotness", cl::Hidden, + cl::desc("This option specifies the hotness of static data when profile " + "information is unavailable"), + cl::init(MachineFunctionDataHotness::Hot), + cl::values(clEnumValN(MachineFunctionDataHotness::Hot, "hot", "Hot"), + clEnumValN(MachineFunctionDataHotness::Cold, "cold", "Cold"))); + +class StaticDataSplitter : public MachineFunctionPass { + const MachineBranchProbabilityInfo *MBPI = nullptr; + const MachineBlockFrequencyInfo *MBFI = nullptr; + const ProfileSummaryInfo *PSI = nullptr; + + // Returns true iff any jump table is hot-cold categorized. + bool splitJumpTables(MachineFunction &MF); + + // Same as above but works on functions with profile information. + bool splitJumpTablesWithProfiles(const MachineFunction &MF, + MachineJumpTableInfo &MJTI); + +public: + static char ID; + + StaticDataSplitter() : MachineFunctionPass(ID) { + initializeStaticDataSplitterPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Static Data Splitter"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +bool StaticDataSplitter::runOnMachineFunction(MachineFunction &MF) { + MBPI = &getAnalysis().getMBPI(); + MBFI = &getAnalysis().getMBFI(); + PSI = &getAnalysis().getPSI(); + + return splitJumpTables(MF); +} + +bool StaticDataSplitter::splitJumpTablesWithProfiles( + const MachineFunction &MF, MachineJumpTableInfo &MJTI) { + int NumChangedJumpTables = 0; + + // Jump table could be used by either terminating instructions or + // 
non-terminating ones, so we walk all instructions and use + // `MachineOperand::isJTI()` to identify jump table operands. + // Similarly, `MachineOperand::isCPI()` can identify constant pool usages + // in the same loop. + for (const auto &MBB : MF) { + for (const MachineInstr &I : MBB) { + for (const MachineOperand &Op : I.operands()) { + if (!Op.isJTI()) + continue; + const int JTI = Op.getIndex(); + // This is not a source block of jump table. + if (JTI == -1) + continue; + + auto Hotness = MachineFunctionDataHotness::Hot; + + // Hotness is based on source basic block hotness. + // TODO: PSI APIs are about instruction hotness. Introduce API for data + // access hotness. + if (PSI->isColdBlock(&MBB, MBFI)) + Hotness = MachineFunctionDataHotness::Cold; + + if (MJTI.updateJumpTableEntryHotness(JTI, Hotness)) + ++NumChangedJumpTables; + } + } + } + return NumChangedJumpTables > 0; +} + +bool StaticDataSplitter::splitJumpTables(MachineFunction &MF) { + MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + if (!MJTI || MJTI->getJumpTables().empty()) + return false; + + const bool ProfileAvailable = PSI && PSI->hasProfileSummary() && MBFI && + MF.getFunction().hasProfileData(); + auto statOnExit = llvm::make_scope_exit([&] { + if (!AreStatisticsEnabled()) + return; + + if (!ProfileAvailable) { + NumUnknownJumpTables += MJTI->getJumpTables().size(); + return; + } + + for (size_t JTI = 0; JTI < MJTI->getJumpTables().size(); JTI++) { + auto Hotness = MJTI->getJumpTables()[JTI].Hotness; + if (Hotness == MachineFunctionDataHotness::Hot) { + ++NumHotJumpTables; + } else { + assert(Hotness == MachineFunctionDataHotness::Cold && + "A jump table is either hot or cold when profile information is " + "available."); + ++NumColdJumpTables; + } + } + }); + + // Place jump tables according to block hotness if function has profile data. 
+ if (ProfileAvailable) + return splitJumpTablesWithProfiles(MF, *MJTI); + + // If function profile is unavailable (e.g., module not instrumented, or new + // code paths lacking samples), -static-data-default-hotness specifies the + // hotness. + for (size_t JTI = 0; JTI < MJTI->getJumpTables().size(); JTI++) + MF.getJumpTableInfo()->updateJumpTableEntryHotness( + JTI, StaticDataDefaultHotness); + + return true; +} + +char StaticDataSplitter::ID = 0; + +INITIALIZE_PASS_BEGIN(StaticDataSplitter, DEBUG_TYPE, "Split static data", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_END(StaticDataSplitter, DEBUG_TYPE, "Split static data", false, + false) + +MachineFunctionPass *llvm::createStaticDataSplitterPass() { + return new StaticDataSplitter(); +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 5c055896130a1b..d8d9f38da3eae0 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -261,6 +261,11 @@ static cl::opt GCEmptyBlocks("gc-empty-basic-blocks", cl::init(false), cl::Hidden, cl::desc("Enable garbage-collecting empty basic blocks")); +static cl::opt + SplitStaticData("split-static-data", cl::Hidden, cl::init(false), + cl::desc("Split static data sections into hot and cold " + "sections using profile information")); + /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. @@ -1251,6 +1256,8 @@ void TargetPassConfig::addMachinePasses() { } } addPass(createMachineFunctionSplitterPass()); + if (SplitStaticData) + addPass(createStaticDataSplitterPass()); } // We run the BasicBlockSections pass if either we need BB sections or BB // address map (or both). 
diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll new file mode 100644 index 00000000000000..c85338de0c3d4f --- /dev/null +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -0,0 +1,177 @@ +; -stats requires asserts +; requires: asserts + +; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of +; jump table entries so 'switch' needs fewer cases to generate a jump table. +; RUN: llc -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir +; RUN: llc --run-pass=static-data-splitter -stats -x mir %t.mir -o - 2>&1 | FileCheck %s --check-prefix=STAT + +; Tests stat messages are expected. +; COM: Update test to verify section suffixes when target-lowering and assembler changes are implemented. +; COM: Also run static-data-splitter pass with -static-data-default-hotness=cold and check data section suffix. + +; STAT-DAG: 2 static-data-splitter - Number of cold jump tables seen +; STAT-DAG: 2 static-data-splitter - Number of hot jump tables seen +; STAT-DAG: 1 static-data-splitter - Number of jump tables with unknown hotness + +; In function @foo, the 2 switch instructions to jt0.* and jt1.* get lowered to hot jump tables, +; and the 2 switch instructions to jt2.* and jt3.* get lowered to cold jump tables. + +; @func_without_profile doesn't have profiles. It's jump table hotness is unknown. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@str.9 = private constant [7 x i8] c".str.9\00" +@str.10 = private constant [8 x i8] c".str.10\00" +@str.11 = private constant [8 x i8] c".str.11\00" + +@case2 = private constant [7 x i8] c"case 2\00" +@case1 = private constant [7 x i8] c"case 1\00" +@default = private constant [8 x i8] c"default\00" +@jt3 = private constant [4 x i8] c"jt3\00" + +define i32 @foo(i32 %num) !prof !13 { +entry: + %mod3 = sdiv i32 %num, 3 + switch i32 %mod3, label %jt0.default [ + i32 1, label %jt0.bb1 + i32 2, label %jt0.bb2 + ], !prof !14 + +jt0.bb1: + call i32 @puts(ptr @case1) + br label %jt0.epilog + +jt0.bb2: + call i32 @puts(ptr @case2) + br label %jt0.epilog + +jt0.default: + call i32 @puts(ptr @default) + br label %jt0.epilog + +jt0.epilog: + %zero = icmp eq i32 %num, 0 + br i1 %zero, label %cold, label %hot, !prof !15 + +cold: + %c2 = call i32 @transform(i32 %num) + switch i32 %c2, label %jt2.default [ + i32 1, label %jt2.bb1 + i32 2, label %jt2.bb2 + ], !prof !14 + +jt2.bb1: + call i32 @puts(ptr @case1) + br label %jt1.epilog + +jt2.bb2: + call i32 @puts(ptr @case2) + br label %jt1.epilog + +jt2.default: + call i32 @puts(ptr @default) + br label %jt2.epilog + +jt2.epilog: + %c2cmp = icmp ne i32 %c2, 0 + br i1 %c2cmp, label %return, label %jt3.prologue, !prof !16 + +hot: + %c1 = call i32 @compute(i32 %num) + switch i32 %c1, label %jt1.default [ + i32 1, label %jt1.bb1 + i32 2, label %jt1.bb2 + ], !prof !14 + +jt1.bb1: + call i32 @puts(ptr @case1) + br label %jt1.epilog + +jt1.bb2: + call i32 @puts(ptr @case2) + br label %jt1.epilog + +jt1.default: + call i32 @puts(ptr @default) + br label %jt1.epilog + +jt1.epilog: + br label %return + +jt3.prologue: + %c3 = call i32 @cleanup(i32 %num) + switch i32 %c3, label %jt3.default [ + i32 1, label %jt3.bb1 + i32 2, label %jt3.bb2 + ], !prof !14 + +jt3.bb1: + call i32 @puts(ptr @case1) 
+ br label %jt3.epilog + +jt3.bb2: + call i32 @puts(ptr @case2) + br label %jt3.epilog + +jt3.default: + call i32 @puts(ptr @default) + br label %jt3.epilog + +jt3.epilog: + call i32 @puts(ptr @jt3) + br label %return + +return: + ret i32 %mod3 +} + +define void @func_without_profile(i32 %num) { +entry: + switch i32 %num, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb1 + ] + +sw.bb: + call i32 @puts(ptr @str.10) + br label %sw.epilog + +sw.bb1: + call i32 @puts(ptr @str.9) + br label %sw.epilog + +sw.default: + call i32 @puts(ptr @str.11) + br label %sw.epilog + +sw.epilog: + ret void +} + +declare i32 @puts(ptr) +declare i32 @printf(ptr, ...) +declare i32 @compute(i32) +declare i32 @transform(i32) +declare i32 @cleanup(i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 230002} +!4 = !{!"MaxCount", i64 100000} +!5 = !{!"MaxInternalCount", i64 50000} +!6 = !{!"MaxFunctionCount", i64 100000} +!7 = !{!"NumCounts", i64 14} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12} +!11 = !{i32 990000, i64 10000, i32 7} +!12 = !{i32 999999, i64 1, i32 9} +!13 = !{!"function_entry_count", i64 100000} +!14 = !{!"branch_weights", i32 60000, i32 20000, i32 20000} +!15 = !{!"branch_weights", i32 1, i32 99999} +!16 = !{!"branch_weights", i32 99998, i32 1} From d15f3e828d3d3335aa9b92b9013a590b71e56b92 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Thu, 23 Jan 2025 10:41:54 +0530 Subject: [PATCH 090/208] [LoopInterchange] Constrain LI within supported loop nest depth (#118656) This patch is an extension to #115128. After profiling LLVM test-suite, I see a lot of loop nest of depth more than `MaxLoopNestDepth` which is 10. Early exit for them would save compile-time as it would avoid computing DependenceInfo and CacheCost. Please see 'bound-max-depth' branch on compile-time-tracker. 
--- .../lib/Transforms/Scalar/LoopInterchange.cpp | 42 +++++--- .../LoopInterchange/bail-out-one-loop.ll | 2 +- .../LoopInterchange/deep-loop-nest.ll | 95 +++++++++++++++++++ 3 files changed, 125 insertions(+), 14 deletions(-) create mode 100644 llvm/test/Transforms/LoopInterchange/deep-loop-nest.ll diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 5bcc5e41a0e875..d366e749c7370d 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -74,8 +74,15 @@ using CharMatrix = std::vector>; } // end anonymous namespace +// Minimum loop depth supported. +static cl::opt MinLoopNestDepth( + "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, + cl::desc("Minimum depth of loop nest considered for the transform")); + // Maximum loop depth supported. -static const unsigned MaxLoopNestDepth = 10; +static cl::opt MaxLoopNestDepth( + "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, + cl::desc("Maximum depth of loop nest considered for the transform")); #ifndef NDEBUG static void printDepMatrix(CharMatrix &DepMatrix) { @@ -244,10 +251,22 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) { LoopList.push_back(CurrentLoop); } -static bool hasMinimumLoopDepth(SmallVectorImpl &LoopList) { +static bool hasSupportedLoopDepth(SmallVectorImpl &LoopList, + OptimizationRemarkEmitter &ORE) { unsigned LoopNestDepth = LoopList.size(); - if (LoopNestDepth < 2) { - LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); + if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) { + LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth + << ", the supported range is [" << MinLoopNestDepth + << ", " << MaxLoopNestDepth << "].\n"); + Loop **OuterLoop = LoopList.begin(); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth", + 
(*OuterLoop)->getStartLoc(), + (*OuterLoop)->getHeader()) + << "Unsupported depth of loop nest, the supported range is [" + << std::to_string(MinLoopNestDepth) << ", " + << std::to_string(MaxLoopNestDepth) << "].\n"; + }); return false; } return true; @@ -435,15 +454,11 @@ struct LoopInterchange { bool processLoopList(SmallVectorImpl &LoopList) { bool Changed = false; - // Ensure minimum loop nest depth. - assert(hasMinimumLoopDepth(LoopList) && "Loop nest does not meet minimum depth."); + // Ensure proper loop nest depth. + assert(hasSupportedLoopDepth(LoopList, *ORE) && + "Unsupported depth of loop nest."); unsigned LoopNestDepth = LoopList.size(); - if (LoopNestDepth > MaxLoopNestDepth) { - LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than " - << MaxLoopNestDepth << "\n"); - return false; - } if (!isComputableLoopNest(LoopList)) { LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); return false; @@ -1735,14 +1750,15 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1"); return PreservedAnalyses::all(); } + OptimizationRemarkEmitter ORE(&F); // Ensure minimum depth of the loop nest to do the interchange. 
- if (!hasMinimumLoopDepth(LoopList)) + if (!hasSupportedLoopDepth(LoopList, ORE)) return PreservedAnalyses::all(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); std::unique_ptr CC = CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); - OptimizationRemarkEmitter ORE(&F); + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); U.markLoopNestChanged(true); diff --git a/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll index 788e1b0157d80f..d1cf33acd28319 100644 --- a/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll +++ b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll @@ -15,7 +15,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i6 ; CHECK-NOT: Delinearizing ; CHECK-NOT: Strides: ; CHECK-NOT: Terms: -; CHECK: Loop doesn't contain minimum nesting level. +; CHECK: Unsupported depth of loop nest 1, the supported range is [2, 10]. define void @foo() { entry: diff --git a/llvm/test/Transforms/LoopInterchange/deep-loop-nest.ll b/llvm/test/Transforms/LoopInterchange/deep-loop-nest.ll new file mode 100644 index 00000000000000..3252d3c0d70693 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/deep-loop-nest.ll @@ -0,0 +1,95 @@ +; RUN: opt < %s -passes=loop-interchange -pass-remarks-missed='loop-interchange' \ +; RUN: -disable-output 2>&1 | FileCheck %s + +; RUN: opt < %s -passes=loop-interchange -pass-remarks-missed='loop-interchange' \ +; RUN: -loop-interchange-max-loop-nest-depth=12 -disable-output 2>&1 | \ +; RUN: FileCheck --allow-empty -check-prefix=CHECK-MAX %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK: Unsupported depth of loop nest, the supported range is [2, 10]. +; CHECK-MAX-NOT: Unsupported depth of loop nest, the supported range is [2, 10]. 
+define void @big_loop_nest() { +entry: + br label %for1.header + +for1.header: + %j = phi i64 [ 0, %entry ], [ %j.next, %for1.inc ] + br label %for2.header +for2.header: + %k = phi i64 [ 0, %for1.header ], [ %k.next, %for2.inc ] + br label %for3.header +for3.header: + %l = phi i64 [ 0, %for2.header ], [ %l.next, %for3.inc ] + br label %for4.header +for4.header: + %m = phi i64 [ 0, %for3.header ], [ %m.next, %for4.inc ] + br label %for5.header +for5.header: + %n = phi i64 [ 0, %for4.header ], [ %n.next, %for5.inc ] + br label %for6.header +for6.header: + %o = phi i64 [ 0, %for5.header ], [ %o.next, %for6.inc ] + br label %for7.header +for7.header: + %p = phi i64 [ 0, %for6.header ], [ %p.next, %for7.inc ] + br label %for8.header +for8.header: + %q = phi i64 [ 0, %for7.header ], [ %q.next, %for8.inc ] + br label %for9.header +for9.header: + %r = phi i64 [ 0, %for8.header ], [ %r.next, %for9.inc ] + br label %for10.header +for10.header: + %s = phi i64 [ 0, %for9.header ], [ %s.next, %for10.inc ] + br label %for11 +for11: + %t = phi i64 [ %t.next, %for11 ], [ 0, %for10.header ] + %t.next = add nuw nsw i64 %t, 1 + %exitcond = icmp eq i64 %t.next, 99 + br i1 %exitcond, label %for1.inc, label %for11 + +for1.inc: + %j.next = add nuw nsw i64 %j, 1 + %exitcond26 = icmp eq i64 %j.next, 99 + br i1 %exitcond26, label %for2.inc, label %for1.header +for2.inc: + %k.next = add nuw nsw i64 %k, 1 + %exitcond27 = icmp eq i64 %j.next, 99 + br i1 %exitcond27, label %for3.inc, label %for2.header +for3.inc: + %l.next = add nuw nsw i64 %l, 1 + %exitcond28 = icmp eq i64 %l.next, 99 + br i1 %exitcond28, label %for4.inc, label %for3.header +for4.inc: + %m.next = add nuw nsw i64 %m, 1 + %exitcond29 = icmp eq i64 %m.next, 99 + br i1 %exitcond29, label %for5.inc, label %for4.header +for5.inc: + %n.next = add nuw nsw i64 %n, 1 + %exitcond30 = icmp eq i64 %n.next, 99 + br i1 %exitcond30, label %for6.inc, label %for5.header +for6.inc: + %o.next = add nuw nsw i64 %o, 1 + %exitcond31 = icmp eq i64 
%o.next, 99 + br i1 %exitcond31, label %for7.inc, label %for6.header +for7.inc: + %p.next = add nuw nsw i64 %p, 1 + %exitcond32 = icmp eq i64 %p.next, 99 + br i1 %exitcond32, label %for8.inc, label %for7.header +for8.inc: + %q.next = add nuw nsw i64 %q, 1 + %exitcond33 = icmp eq i64 %q.next, 99 + br i1 %exitcond33, label %for9.inc, label %for8.header +for9.inc: + %r.next = add nuw nsw i64 %r, 1 + %exitcond34 = icmp eq i64 %q.next, 99 + br i1 %exitcond34, label %for10.inc, label %for9.header +for10.inc: + %s.next = add nuw nsw i64 %s, 1 + %exitcond35 = icmp eq i64 %s.next, 99 + br i1 %exitcond35, label %for.end, label %for10.header + +for.end: + ret void +} From 646f034e4e228f9d5d6a0142210e5e28f2ea7872 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 Jan 2025 05:16:48 +0000 Subject: [PATCH 091/208] [gn build] Port de209fa11b54 --- llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index ab72ac4ae9f4bb..23f5d03583556a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -228,6 +228,7 @@ static_library("CodeGen") { "StackMaps.cpp", "StackProtector.cpp", "StackSlotColoring.cpp", + "StaticDataSplitter.cpp", "SwiftErrorValueTracking.cpp", "SwitchLoweringUtils.cpp", "TailDuplication.cpp", From daa18205c6f0a3b5dd62ba2e65948e1a9182a60f Mon Sep 17 00:00:00 2001 From: Kaviya Rajendiran <67495422+kaviya2510@users.noreply.github.com> Date: Thu, 23 Jan 2025 11:14:00 +0530 Subject: [PATCH 092/208] [Flang][OpenMP] Fix copyin allocatable lowering to MLIR (#122097) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/llvm/llvm-project/issues/113191 Issue: [flang][OpenMP] Runtime segfault when an allocatable variable is used with copyin Rootcause: The value of the threadprivate variable is not 
being copied from the primary thread to the other threads within a parallel region. As a result it tries to access a null pointer inside a parallel region which causes segfault. Fix: When allocatables used with copyin clause need to ensure that, on entry to any parallel region each thread’s copy of a variable will acquire the allocation status of the primary thread, before copying the value of a threadprivate variable of the primary thread to the threadprivate variable of each other member of the team. --- flang/lib/Lower/Bridge.cpp | 27 ++++++- flang/test/Lower/OpenMP/copyin.f90 | 112 ++++++++++++++++++++++++++--- 2 files changed, 127 insertions(+), 12 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 700ca56141a324..d92dc0cf9abd62 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1290,9 +1290,30 @@ class FirConverter : public Fortran::lower::AbstractConverter { auto loadVal = builder->create(loc, rhs); builder->create(loc, loadVal, lhs); } else if (isAllocatable && - (flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) || - flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn))) { - // For firstprivate and copyin allocatable variables, RHS must be copied + flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn)) { + // For copyin allocatable variables, RHS must be copied to lhs + // only when rhs is allocated. + hlfir::Entity temp = + hlfir::derefPointersAndAllocatables(loc, *builder, rhs); + mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, temp); + mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); + builder->genIfThenElse(loc, isAllocated) + .genThen([&]() { copyData(lhs, rhs); }) + .genElse([&]() { + fir::ExtendedValue hexv = symBoxToExtendedValue(dst); + hexv.match( + [&](const fir::MutableBoxValue &new_box) -> void { + // if the allocation status of original list item is + // unallocated, unallocate the copy if it is allocated, else + // do nothing. 
+ Fortran::lower::genDeallocateIfAllocated(*this, new_box, loc); + }, + [&](const auto &) -> void {}); + }) + .end(); + } else if (isAllocatable && + flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) { + // For firstprivate allocatable variables, RHS must be copied // only when LHS is allocated. hlfir::Entity temp = hlfir::derefPointersAndAllocatables(loc, *builder, lhs); diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index 9e9ccf8e3d9142..ec4c544495c609 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -395,12 +395,34 @@ subroutine pointer() ! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref>>> -> !fir.ref>>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatableEp"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>>> -! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 realloc : !fir.box>>, !fir.ref>>> +! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>>) -> !fir.heap> +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap>) -> i64 +! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %[[C0_I64]] : i64 +! CHECK: fir.if %[[VAL_9]] { +! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>>> +! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 realloc : !fir.box>>, !fir.ref>>> +! CHECK: } else { +! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref>>> +! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box>>) -> !fir.heap> +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (!fir.heap>) -> i64 +! CHECK: %[[C0_I64_0:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_13:.*]] = arith.cmpi ne, %[[VAL_12]], %[[C0_I64_0]] : i64 +! CHECK: fir.if %[[VAL_13]] { +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref>>> +! 
CHECK: %[[VAL_15:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box>>) -> !fir.heap> +! CHECK: fir.freemem %[[VAL_15]] : !fir.heap> +! CHECK: %[[VAL_16:.*]] = fir.zero_bits !fir.heap> +! CHECK: %[[C0:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_17:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_16]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> +! CHECK: fir.store %[[VAL_18]] to %[[VAL_5]]#1 : !fir.ref>>> +! CHECK: } ! CHECK: omp.barrier -! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>> -! CHECK: %[[VAL_8:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.heap>) -> !fir.ref> -! CHECK: fir.call @_QPsub8(%[[VAL_9]]) fastmath : (!fir.ref>) -> () +! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>> +! CHECK: %[[VAL_20:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box>>) -> !fir.heap> +! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap>) -> !fir.ref> +! CHECK: fir.call @_QPsub8(%[[VAL_21]]) fastmath : (!fir.ref>) -> () ! CHECK: omp.terminator ! CHECK: } ! CHECK: return @@ -422,7 +444,7 @@ subroutine allocatable() ! CHECK: omp.parallel { ! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref>> -> !fir.ref>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatable2Ea"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> ! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>) -> !fir.heap ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap) -> i64 ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64 @@ -432,10 +454,23 @@ subroutine allocatable() ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box>) -> !fir.heap ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.heap ! 
CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_5]]#0 realloc : i32, !fir.ref>> -! CHECK: } +! CHECK: } else { +! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref>> +! CHECK: %[[VAL_15:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (!fir.heap) -> i64 +! CHECK: %[[C0_I64_0:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[C0_I64_0]] : i64 +! CHECK: fir.if %[[VAL_17]] { +! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref>> +! CHECK: %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.box>) -> !fir.heap +! CHECK: fir.freemem %[[VAL_19]] : !fir.heap +! CHECK: %[[VAL_20:.*]] = fir.zero_bits !fir.heap +! CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]] : (!fir.heap) -> !fir.box> +! CHECK: fir.store %[[VAL_21]] to %[[VAL_5]]#1 : !fir.ref>> +! CHECK: } ! CHECK: omp.barrier -! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 realloc : i32, !fir.ref>> +! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 +! CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_5]]#0 realloc : i32, !fir.ref>> ! CHECK: omp.terminator ! CHECK: } ! CHECK: return @@ -448,3 +483,62 @@ subroutine allocatable2() a = 1 !$omp end parallel end subroutine + +! CHECK: func.func @_QPallocatable3() { +! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFallocatable3Ea) : !fir.ref>> +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatable3Ea"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref>> -> !fir.ref>> +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatable3Ea"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFallocatable3Ea.alloc"} +! CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_4]] : (!fir.heap) -> !fir.box> +! 
CHECK: fir.store %[[VAL_5]] to %[[VAL_3]]#1 : !fir.ref>> +! CHECK: %[[C10_I32:.*]] = arith.constant 10 : i32 +! CHECK: hlfir.assign %[[C10_I32]] to %[[VAL_3]]#0 realloc : i32, !fir.ref>> +! CHECK: omp.parallel { +! CHECK: %[[VAL_6:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref>> -> !fir.ref>> +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatable3Ea"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> +! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (!fir.heap) -> i64 +! CHECK: %[[C10_I64:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_11:.*]] = arith.cmpi ne, %[[VAL_10]], %[[C10_I64]] : i64 +! CHECK: fir.if %[[VAL_11]] { +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> +! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_12]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.heap +! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_7]]#0 realloc : i32, !fir.ref>> +! CHECK: } else { +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_7]]#1 : !fir.ref>> +! CHECK: %[[VAL_15:.*]] = fir.box_addr %[[VAL_12]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (!fir.heap) -> i64 +! CHECK: %[[C0_I64_0:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[C0_I64_0]] : i64 +! CHECK: fir.if %[[VAL_17]] { +! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_7]]#1 : !fir.ref>> +! CHECK: %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.box>) -> !fir.heap +! CHECK: fir.freemem %[[VAL_19]] : !fir.heap +! CHECK: %[[VAL_20:.*]] = fir.zero_bits !fir.heap +! CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]] : (!fir.heap) -> !fir.box> +! CHECK: fir.store %[[VAL_21]] to %[[VAL_7]]#1 : !fir.ref>> +! CHECK: } +! CHECK: } +! CHECK: omp.barrier +! CHECK: %[[VAL_22:.*]] = fir.load %7#0 : !fir.ref>> +! 
CHECK: %[[VAL_23:.*]] = fir.box_addr %[[VAL_22]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.heap +! CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_25:.*]]= arith.addi %[[VAL_24]], %[[C1_I32]] : i32 +! CHECK: hlfir.assign %[[VAL_25]]to %[[VAL_7]]#0 realloc : i32, !fir.ref>> +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } +subroutine allocatable3() + integer, allocatable, save :: a + !$omp threadprivate(a) + allocate(a) + a = 10 + !$omp parallel copyin(a) + a = a + 1 + !$omp end parallel +end subroutine From ea49d474fd355a9fdc3d549c4f927b970181f4c9 Mon Sep 17 00:00:00 2001 From: mingmingl Date: Wed, 22 Jan 2025 21:46:51 -0800 Subject: [PATCH 093/208] Specify triple for llc test --- llvm/test/CodeGen/X86/jump-table-partition.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index c85338de0c3d4f..5d04df480013b8 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -3,8 +3,8 @@ ; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of ; jump table entries so 'switch' needs fewer cases to generate a jump table. -; RUN: llc -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir -; RUN: llc --run-pass=static-data-splitter -stats -x mir %t.mir -o - 2>&1 | FileCheck %s --check-prefix=STAT +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir +; RUN: llc -mtriple=x86_64-unknown-linux-gnu --run-pass=static-data-splitter -stats -x mir %t.mir -o - 2>&1 | FileCheck %s --check-prefix=STAT ; Tests stat messages are expected. ; COM: Update test to verify section suffixes when target-lowering and assembler changes are implemented. 
From 5d8390d48e5c03235b3c83748e4a2eec0a19ae65 Mon Sep 17 00:00:00 2001 From: mingmingl Date: Wed, 22 Jan 2025 22:33:17 -0800 Subject: [PATCH 094/208] Temporarily disable test on Fuchsia --- llvm/test/CodeGen/X86/jump-table-partition.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index 5d04df480013b8..e4f8d90baafdb7 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -1,6 +1,9 @@ ; -stats requires asserts ; requires: asserts +; COM: Investigate test failure with fuchsia environment and re-enable the test. +; UNSUPPORTED: target={{.*}}-fuchsia + ; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of ; jump table entries so 'switch' needs fewer cases to generate a jump table. ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir From c3dfd34e54c1cb9e0e6c7472a6d30d03a63f6f0a Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 22 Jan 2025 22:39:43 -0800 Subject: [PATCH 095/208] [WebAssembly] Add unreachable before catch destinations (#123915) When `try_table`'s catch clause's destination has a return type, as in the case of catch with a concrete tag, catch_ref, and catch_all_ref. For example: ```wasm block exnref try_table (catch_all_ref 0) ... end_try_table end_block ... use exnref ... ``` This code is not valid because the block's body type is not exnref. So we add an unreachable after the 'end_try_table' to make the code valid here: ```wasm block exnref try_table (catch_all_ref 0) ... end_try_table unreachable ;; Newly added end_block ``` Because 'unreachable' is a terminator we also need to split the BB. --- We need to handle the same thing for unwind mismatch handling. 
In the code below, we create a "trampoline BB" that will be the destination for the nested `try_table`~`end_try_table` added to fix an unwind mismatch: ```wasm try_table (catch ... ) block exnref ... try_table (catch_all_ref N) some code end_try_table ... end_block ;; Trampoline BB throw_ref end_try_table ``` While the `block` added for the trampoline BB has the return type `exnref`, its body, which contains the nested `try_table` and other code, wouldn't have the `exnref` return type. Most times it didn't become a problem because the block's body ended with something like `br` or `return`, but that may not always be the case, especially when there is a loop. So we add an `unreachable` to make the code valid here too: ```wasm try_table (catch ... ) block exnref ... try_table (catch_all_ref N) some code end_try_table ... unreachable ;; Newly added end_block ;; Trampoline BB throw_ref end_try_table ``` In this case we just append the `unreachable` at the end of the layout predecessor BB. (This was tricky to do in the first (non-mismatch) case because there `end_try_table` and `end_block` were added in the beginning of an EH pad in `placeTryTableMarker` and moving `end_try_table` and the new `unreachable` to the previous BB caused other problems.) --- This adds many `unreachable`s to the output, but this adds `unreachable` to only a few places to see if this is working. The FileCheck lines in `exception.ll` and `cfg-stackify-eh.ll` are already heavily redacted to only leave important control-flow instructions, so I don't think it's worth adding `unreachable`s everywhere. 
--- .../WebAssembly/WebAssemblyCFGStackify.cpp | 107 ++++++++++++++---- .../WebAssembly/WebAssemblyInstrControl.td | 2 +- .../CodeGen/WebAssembly/cfg-stackify-eh.ll | 1 + .../CodeGen/WebAssembly/exception-legacy.mir | 3 +- llvm/test/CodeGen/WebAssembly/exception.ll | 1 + 5 files changed, 93 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 6cae0e766dbc02..bdc1cc6d652ac6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1297,6 +1297,7 @@ void WebAssemblyCFGStackify::addNestedTryDelegate( // some code // end_try_table // ... +// unreachable // end_block ;; Trampoline BB // throw_ref // end_try_table @@ -1358,6 +1359,13 @@ WebAssemblyCFGStackify::getTrampolineBlock(MachineBasicBlock *UnwindDest) { BuildMI(TrampolineBB, EndDebugLoc, TII.get(WebAssembly::THROW_REF)) .addReg(ExnReg); + // The trampoline BB's return type is exnref because it is a target of + // catch_all_ref. But the body type of the block we just created is not. We + // add an 'unreachable' right before the 'end_block' to make the code valid. 
+ MachineBasicBlock *TrampolineLayoutPred = TrampolineBB->getPrevNode(); + BuildMI(TrampolineLayoutPred, TrampolineLayoutPred->findBranchDebugLoc(), + TII.get(WebAssembly::UNREACHABLE)); + registerScope(Block, EndBlock); UnwindDestToTrampoline[UnwindDest] = TrampolineBB; return TrampolineBB; @@ -1465,7 +1473,7 @@ void WebAssemblyCFGStackify::addNestedTryTable(MachineInstr *RangeBegin, // - After: // pre_bb: (new) // range_end - // end_try_table: (new) + // end_try_table_bb: (new) // end_try_table // post_bb: (previous 'ehpad') // catch @@ -1523,9 +1531,9 @@ void WebAssemblyCFGStackify::addNestedTryTable(MachineInstr *RangeBegin, // end_loop // end_try_table // -// So if the unwind dest BB has a end_loop before an end_try_table, we split the -// BB with the end_loop as a separate BB before the end_try_table BB, so that -// after we fix the unwind mismatch, the code will be like: +// So if an end_try_table BB has an end_loop before the end_try_table, we split +// the BB with the end_loop as a separate BB before the end_try_table BB, so +// that after we fix the unwind mismatch, the code will be like: // bb0: // try_table // block exnref @@ -1538,10 +1546,10 @@ void WebAssemblyCFGStackify::addNestedTryTable(MachineInstr *RangeBegin, // end_block // end_try_table_bb: // end_try_table -static void splitEndLoopBB(MachineBasicBlock *UnwindDest) { - auto &MF = *UnwindDest->getParent(); +static void splitEndLoopBB(MachineBasicBlock *EndTryTableBB) { + auto &MF = *EndTryTableBB->getParent(); MachineInstr *EndTryTable = nullptr, *EndLoop = nullptr; - for (auto &MI : reverse(*UnwindDest)) { + for (auto &MI : reverse(*EndTryTableBB)) { if (MI.getOpcode() == WebAssembly::END_TRY_TABLE) { EndTryTable = &MI; continue; @@ -1555,11 +1563,11 @@ static void splitEndLoopBB(MachineBasicBlock *UnwindDest) { return; auto *EndLoopBB = MF.CreateMachineBasicBlock(); - MF.insert(UnwindDest->getIterator(), EndLoopBB); + MF.insert(EndTryTableBB->getIterator(), EndLoopBB); auto SplitPos = 
std::next(EndLoop->getIterator()); - EndLoopBB->splice(EndLoopBB->end(), UnwindDest, UnwindDest->begin(), + EndLoopBB->splice(EndLoopBB->end(), EndTryTableBB, EndTryTableBB->begin(), SplitPos); - EndLoopBB->addSuccessor(UnwindDest); + EndLoopBB->addSuccessor(EndTryTableBB); } bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { @@ -1943,8 +1951,16 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { // When end_loop is before end_try_table within the same BB in unwind // destinations, we should split the end_loop into another BB. if (!WebAssembly::WasmUseLegacyEH) - for (auto &[UnwindDest, _] : UnwindDestToTryRanges) - splitEndLoopBB(UnwindDest); + for (auto &[UnwindDest, _] : UnwindDestToTryRanges) { + auto It = EHPadToTry.find(UnwindDest); + // If UnwindDest is the fake caller block, it will not be in EHPadToTry + // map + if (It != EHPadToTry.end()) { + auto *TryTable = It->second; + auto *EndTryTable = BeginToEnd[TryTable]; + splitEndLoopBB(EndTryTable->getParent()); + } + } // Now we fix the mismatches by wrapping calls with inner try-delegates. for (auto &P : UnwindDestToTryRanges) { @@ -2179,8 +2195,15 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { // When end_loop is before end_try_table within the same BB in unwind // destinations, we should split the end_loop into another BB. 
- for (auto &[_, UnwindDest] : EHPadToUnwindDest) - splitEndLoopBB(UnwindDest); + for (auto &[_, UnwindDest] : EHPadToUnwindDest) { + auto It = EHPadToTry.find(UnwindDest); + // If UnwindDest is the fake caller block, it will not be in EHPadToTry map + if (It != EHPadToTry.end()) { + auto *TryTable = It->second; + auto *EndTryTable = BeginToEnd[TryTable]; + splitEndLoopBB(EndTryTable->getParent()); + } + } NumCatchUnwindMismatches += EHPadToUnwindDest.size(); SmallPtrSet NewEndTryBBs; @@ -2372,6 +2395,48 @@ static void appendEndToFunction(MachineFunction &MF, TII.get(WebAssembly::END_FUNCTION)); } +// We added block~end_block and try_table~end_try_table markers in +// placeTryTableMarker. But When catch clause's destination has a return type, +// as in the case of catch with a concrete tag, catch_ref, and catch_all_ref. +// For example: +// block exnref +// try_table (catch_all_ref 0) +// ... +// end_try_table +// end_block +// ... use exnref ... +// +// This code is not valid because the block's body type is not exnref. So we add +// an unreachable after the 'end_try_table' to make the code valid here: +// block exnref +// try_table (catch_all_ref 0) +// ... +// end_try_table +// unreachable (new) +// end_block +// +// Because 'unreachable' is a terminator we also need to split the BB. 
+static void addUnreachableAfterTryTables(MachineFunction &MF, + const WebAssemblyInstrInfo &TII) { + std::vector EndTryTables; + for (auto &MBB : MF) + for (auto &MI : MBB) + if (MI.getOpcode() == WebAssembly::END_TRY_TABLE) + EndTryTables.push_back(&MI); + + for (auto *EndTryTable : EndTryTables) { + auto *MBB = EndTryTable->getParent(); + auto *NewEndTryTableBB = MF.CreateMachineBasicBlock(); + MF.insert(MBB->getIterator(), NewEndTryTableBB); + auto SplitPos = std::next(EndTryTable->getIterator()); + NewEndTryTableBB->splice(NewEndTryTableBB->end(), MBB, MBB->begin(), + SplitPos); + NewEndTryTableBB->addSuccessor(MBB); + BuildMI(NewEndTryTableBB, EndTryTable->getDebugLoc(), + TII.get(WebAssembly::UNREACHABLE)); + } +} + /// Insert BLOCK/LOOP/TRY/TRY_TABLE markers at appropriate places. void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // We allocate one more than the number of blocks in the function to @@ -2398,13 +2463,17 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { } } - // Fix mismatches in unwind destinations induced by linearizing the code. if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && MF.getFunction().hasPersonalityFn()) { - bool MismatchFixed = fixCallUnwindMismatches(MF); - MismatchFixed |= fixCatchUnwindMismatches(MF); - if (MismatchFixed) - recalculateScopeTops(MF); + const auto &TII = *MF.getSubtarget().getInstrInfo(); + // Add an 'unreachable' after 'end_try_table's. + addUnreachableAfterTryTables(MF, TII); + // Fix mismatches in unwind destinations induced by linearizing the code. + fixCallUnwindMismatches(MF); + fixCatchUnwindMismatches(MF); + // addUnreachableAfterTryTables and fixUnwindMismatches create new BBs, so + // we need to recalculate ScopeTops. 
+ recalculateScopeTops(MF); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index b68dd8809bb920..ed6aec1ab33e3f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -144,7 +144,7 @@ defm THROW_REF : I<(outs), (ins EXNREF:$exn), (outs), (ins), [], "throw_ref \t$exn", "throw_ref", 0x0a>; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 -// Region within which an exception is caught: try / end_try +// Region within which an exception is caught: try_table / end_try_table let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in { defm TRY_TABLE : I<(outs), (ins Signature:$sig, variable_ops), (outs), (ins Signature:$sig, catch_list:$cal), [], diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll index 683b03d16d57bd..98de9a267b95a5 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll @@ -857,6 +857,7 @@ invoke.cont: ; preds = %entry ; NOSORT: loop ; NOSORT: call foo ; NOSORT: end_loop +; NOSORT: unreachable ; NOSORT: end_block # label[[L3]]: ; NOSORT: throw_ref ; NOSORT: end_try_table diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir index d6f734c64acd69..9273ceeadd0e73 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir @@ -78,7 +78,8 @@ body: | EH_LABEL CATCHRET %bb.2, %bb.1, implicit-def dead $arguments - ; CHECK: bb.2 + ; This BB should remain (it will be renumbered to bb.1) + ; CHECK: bb.1 bb.2: ; predecessors: %bb.0, %bb.1 RETURN implicit-def dead $arguments diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll index d6f3ffc8c33cb1..304664b622e800 100644 --- a/llvm/test/CodeGen/WebAssembly/exception.ll +++ 
b/llvm/test/CodeGen/WebAssembly/exception.ll @@ -38,6 +38,7 @@ define void @throw(ptr %p) { ; CHECK: call foo ; CHECK: br 2 ; CHECK: end_try_table +; CHECK: unreachable ; CHECK: end_block ; CHECK: local.set 2 ; CHECK: local.get 0 From ba174855203403f6c3e2a46bdd79dbb3e27ac6a4 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Thu, 23 Jan 2025 01:43:44 -0500 Subject: [PATCH 096/208] [clang][CodeComplete] Use HeuristicResolver to resolve DependentNameTypes (#123818) Fixes https://github.com/clangd/clangd/issues/1249 --- clang/docs/ReleaseNotes.rst | 7 ++++++ clang/lib/Sema/SemaCodeComplete.cpp | 24 +++++++++++++++------ clang/test/CodeCompletion/member-access.cpp | 16 ++++++++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 75931bb25f06d9..a03f42ab910edd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1272,6 +1272,13 @@ libclang - Added ``clang_getOffsetOfBase``, which allows computing the offset of a base class in a class's layout. + +Code Completion +--------------- + +- Use ``HeuristicResolver`` (upstreamed from clangd) to improve code completion results + in dependent code + Static Analyzer --------------- diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 69cda6e68bd36b..58f3efbe0daf89 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -5736,11 +5736,19 @@ class ConceptInfo { // In particular, when E->getType() is DependentTy, try to guess a likely type. // We accept some lossiness (like dropping parameters). // We only try to handle common expressions on the LHS of MemberExpr. 
-QualType getApproximateType(const Expr *E) { +QualType getApproximateType(const Expr *E, HeuristicResolver &Resolver) { if (E->getType().isNull()) return QualType(); E = E->IgnoreParenImpCasts(); QualType Unresolved = E->getType(); + // Resolve DependentNameType + if (const auto *DNT = Unresolved->getAs()) { + if (auto Decls = Resolver.resolveDependentNameType(DNT); + Decls.size() == 1) { + if (const auto *TD = dyn_cast(Decls[0])) + return QualType(TD->getTypeForDecl(), 0); + } + } // We only resolve DependentTy, or undeduced autos (including auto* etc). if (!Unresolved->isSpecificBuiltinType(BuiltinType::Dependent)) { AutoType *Auto = Unresolved->getContainedAutoType(); @@ -5749,7 +5757,7 @@ QualType getApproximateType(const Expr *E) { } // A call: approximate-resolve callee to a function type, get its return type if (const CallExpr *CE = llvm::dyn_cast(E)) { - QualType Callee = getApproximateType(CE->getCallee()); + QualType Callee = getApproximateType(CE->getCallee(), Resolver); if (Callee.isNull() || Callee->isSpecificPlaceholderType(BuiltinType::BoundMember)) Callee = Expr::findBoundMemberType(CE->getCallee()); @@ -5792,7 +5800,7 @@ QualType getApproximateType(const Expr *E) { if (const auto *CDSME = llvm::dyn_cast(E)) { QualType Base = CDSME->isImplicitAccess() ? CDSME->getBaseType() - : getApproximateType(CDSME->getBase()); + : getApproximateType(CDSME->getBase(), Resolver); if (CDSME->isArrow() && !Base.isNull()) Base = Base->getPointeeType(); // could handle unique_ptr etc here? auto *RD = @@ -5813,14 +5821,15 @@ QualType getApproximateType(const Expr *E) { if (const auto *DRE = llvm::dyn_cast(E)) { if (const auto *VD = llvm::dyn_cast(DRE->getDecl())) { if (VD->hasInit()) - return getApproximateType(VD->getInit()); + return getApproximateType(VD->getInit(), Resolver); } } if (const auto *UO = llvm::dyn_cast(E)) { if (UO->getOpcode() == UnaryOperatorKind::UO_Deref) { // We recurse into the subexpression because it could be of dependent // type. 
- if (auto Pointee = getApproximateType(UO->getSubExpr())->getPointeeType(); + if (auto Pointee = + getApproximateType(UO->getSubExpr(), Resolver)->getPointeeType(); !Pointee.isNull()) return Pointee; // Our caller expects a non-null result, even though the SubType is @@ -5857,7 +5866,8 @@ void SemaCodeCompletion::CodeCompleteMemberReferenceExpr( SemaRef.PerformMemberExprBaseConversion(Base, IsArrow); if (ConvertedBase.isInvalid()) return; - QualType ConvertedBaseType = getApproximateType(ConvertedBase.get()); + QualType ConvertedBaseType = + getApproximateType(ConvertedBase.get(), Resolver); enum CodeCompletionContext::Kind contextKind; @@ -5896,7 +5906,7 @@ void SemaCodeCompletion::CodeCompleteMemberReferenceExpr( return false; Base = ConvertedBase.get(); - QualType BaseType = getApproximateType(Base); + QualType BaseType = getApproximateType(Base, Resolver); if (BaseType.isNull()) return false; ExprValueKind BaseKind = Base->getValueKind(); diff --git a/clang/test/CodeCompletion/member-access.cpp b/clang/test/CodeCompletion/member-access.cpp index ab6dc69bf2923d..bf35f7ad021f71 100644 --- a/clang/test/CodeCompletion/member-access.cpp +++ b/clang/test/CodeCompletion/member-access.cpp @@ -401,3 +401,19 @@ struct node { } }; } + +namespace dependent_nested_class { +template +struct Foo { + struct Bar { + int field; + }; +}; +template +void f() { + typename Foo::Bar bar; + bar.field; + // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:415:7 %s -o - | FileCheck -check-prefix=CHECK-DEPENDENT-NESTEDCLASS %s + // CHECK-DEPENDENT-NESTEDCLASS: [#int#]field +} +} From 220004d2f8692e3a224dc75f7a7c6001711d3d58 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Thu, 23 Jan 2025 15:00:08 +0800 Subject: [PATCH 097/208] [GISel] Add more FP opcodes to CSE (#123949) Resubmit, previously PR has compilation issues. 
--- llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 10 ++ llvm/unittests/CodeGen/GlobalISel/CSETest.cpp | 132 ++++++++++++++++++ 2 files changed, 142 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp index 0ac4a8a0aa910b..3a9069848ca1db 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -65,6 +65,16 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) { case TargetOpcode::G_BUILD_VECTOR: case TargetOpcode::G_BUILD_VECTOR_TRUNC: case TargetOpcode::G_SEXT_INREG: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FABS: + // TODO: support G_FNEG. + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXNUM_IEEE: + case TargetOpcode::G_FMINNUM_IEEE: return true; } return false; diff --git a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp index 822707a1f4ed32..cd6e32311a9eee 100644 --- a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp @@ -75,6 +75,138 @@ TEST_F(AArch64GISelMITest, TestCSE) { auto MIBUnmerge2 = CSEB.buildUnmerge({s32, s32}, Copies[0]); EXPECT_TRUE(&*MIBUnmerge == &*MIBUnmerge2); + // Check G_FADD + { + auto MIBFAdd = CSEB.buildFAdd(s32, Copies[0], Copies[1]); + auto MIBFAdd2 = CSEB.buildFAdd(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFAdd == &*MIBFAdd2); + + auto MIBFAdd3 = + CSEB.buildFAdd(s32, Copies[0], Copies[1], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFAdd == &*MIBFAdd3); + + MIBFAdd2->setFlag(MachineInstr::FmNsz); + MIBFAdd2->clearFlag(MachineInstr::FmNsz); + EXPECT_TRUE(&*MIBFAdd == &*MIBFAdd2); + } + + // Check G_FSUB + { + auto MIBFSub = CSEB.buildFSub(s32, Copies[0], Copies[1]); + auto MIBFSub2 = CSEB.buildFSub(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFSub == &*MIBFSub2); + + auto MIBFSub3 = + CSEB.buildFSub(s32, 
Copies[0], Copies[1], MachineInstr::FmNoNans); + EXPECT_FALSE(&*MIBFSub == &*MIBFSub3); + + MIBFSub2->setFlag(MachineInstr::FmNoNans); + MIBFSub2->clearFlag(MachineInstr::FmNoNans); + EXPECT_TRUE(&*MIBFSub == &*MIBFSub2); + } + + // Check G_FMUL + { + auto MIBFMul = CSEB.buildFMul(s32, Copies[0], Copies[1]); + auto MIBFMul2 = CSEB.buildFMul(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFMul == &*MIBFMul2); + + auto MIBFMul3 = + CSEB.buildFMul(s32, Copies[0], Copies[1], MachineInstr::FmNoNans); + EXPECT_FALSE(&*MIBFMul == &*MIBFMul3); + + MIBFMul2->setFlag(MachineInstr::FmNoNans); + MIBFMul2->clearFlag(MachineInstr::FmNoNans); + EXPECT_TRUE(&*MIBFMul == &*MIBFMul2); + } + + // Check G_FDIV + { + auto MIBFDiv = CSEB.buildFDiv(s32, Copies[0], Copies[1]); + auto MIBFDiv2 = CSEB.buildFDiv(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFDiv == &*MIBFDiv2); + + auto MIBFDiv3 = + CSEB.buildFDiv(s32, Copies[0], Copies[1], MachineInstr::FmNoNans); + EXPECT_FALSE(&*MIBFDiv == &*MIBFDiv3); + + MIBFDiv2->setFlag(MachineInstr::FmNoNans); + MIBFDiv2->clearFlag(MachineInstr::FmNoNans); + EXPECT_TRUE(&*MIBFDiv == &*MIBFDiv2); + } + + // Check G_FABS + { + auto MIBFAbs = CSEB.buildFAbs(s32, Copies[0]); + auto MIBFAbs2 = CSEB.buildFAbs(s32, Copies[0]); + EXPECT_TRUE(&*MIBFAbs == &*MIBFAbs2); + + auto MIBFAbs3 = CSEB.buildFAbs(s32, Copies[0], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFAbs == &*MIBFAbs3); + + MIBFAbs2->setFlag(MachineInstr::FmNsz); + MIBFAbs2->clearFlag(MachineInstr::FmNsz); + EXPECT_TRUE(&*MIBFAbs == &*MIBFAbs2); + } + + // Check G_FMINNUM/F_MAXNUM: + { + auto MIBFMinNum = CSEB.buildFMinNum(s32, Copies[0], Copies[1]); + auto MIBFMinNum2 = CSEB.buildFMinNum(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFMinNum == &*MIBFMinNum2); + + auto MIBFMinNum3 = + CSEB.buildFMinNum(s32, Copies[0], Copies[1], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFMinNum == &*MIBFMinNum3); + + MIBFMinNum2->setFlag(MachineInstr::FmNsz); + MIBFMinNum2->clearFlag(MachineInstr::FmNsz); + 
EXPECT_TRUE(&*MIBFMinNum == &*MIBFMinNum2); + } + + { + auto MIBFMaxNum = CSEB.buildFMaxNum(s32, Copies[0], Copies[1]); + auto MIBFMaxNum2 = CSEB.buildFMaxNum(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFMaxNum == &*MIBFMaxNum2); + + auto MIBFMaxNum3 = + CSEB.buildFMaxNum(s32, Copies[0], Copies[1], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFMaxNum == &*MIBFMaxNum3); + + MIBFMaxNum2->setFlag(MachineInstr::FmNsz); + MIBFMaxNum2->clearFlag(MachineInstr::FmNsz); + EXPECT_TRUE(&*MIBFMaxNum == &*MIBFMaxNum2); + } + + // Check G_FMINNUM_IEEE/F_MAXNUM_IEEE: + { + auto MIBFMinNumIEEE = CSEB.buildFMinNumIEEE(s32, Copies[0], Copies[1]); + auto MIBFMinNumIEEE2 = CSEB.buildFMinNumIEEE(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFMinNumIEEE == &*MIBFMinNumIEEE2); + + auto MIBFMinNumIEEE3 = + CSEB.buildFMinNumIEEE(s32, Copies[0], Copies[1], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFMinNumIEEE == &*MIBFMinNumIEEE3); + + MIBFMinNumIEEE2->setFlag(MachineInstr::FmNsz); + MIBFMinNumIEEE2->clearFlag(MachineInstr::FmNsz); + EXPECT_TRUE(&*MIBFMinNumIEEE == &*MIBFMinNumIEEE2); + } + + { + auto MIBFMaxNumIEEE = CSEB.buildFMaxNumIEEE(s32, Copies[0], Copies[1]); + auto MIBFMaxNumIEEE2 = CSEB.buildFMaxNumIEEE(s32, Copies[0], Copies[1]); + EXPECT_TRUE(&*MIBFMaxNumIEEE == &*MIBFMaxNumIEEE2); + + auto MIBFMaxNumIEEE3 = + CSEB.buildFMaxNumIEEE(s32, Copies[0], Copies[1], MachineInstr::FmNsz); + EXPECT_FALSE(&*MIBFMaxNumIEEE == &*MIBFMaxNumIEEE3); + + MIBFMaxNumIEEE2->setFlag(MachineInstr::FmNsz); + MIBFMaxNumIEEE2->clearFlag(MachineInstr::FmNsz); + EXPECT_TRUE(&*MIBFMaxNumIEEE == &*MIBFMaxNumIEEE2); + } + // Check G_BUILD_VECTOR Register Reg1 = MRI->createGenericVirtualRegister(s32); Register Reg2 = MRI->createGenericVirtualRegister(s32); From 3fb8c5b43195d6e11ff0557d07e75700343d369f Mon Sep 17 00:00:00 2001 From: mconst Date: Wed, 22 Jan 2025 23:07:07 -0800 Subject: [PATCH 098/208] [X86] Fix invalid instructions on x32 with large stack frames (#124041) `X86FrameLowering::emitSPUpdate()` 
assumes that 64-bit targets use a 64-bit stack pointer, but that's not true on x32. When checking the stack pointer size, we need to look at `Uses64BitFramePtr` rather than `Is64Bit`. This avoids generating invalid instructions like `add esp, rcx`. For impossibly-large stack frames (4 GiB or larger with a 32-bit stack pointer), we were also generating invalid instructions like `mov eax, 5000000000`. The inline stack probe code already had a check for that situation; I've moved the check into `emitSPUpdate()`, so any attempt to allocate a 4 GiB stack frame with a 32-bit stack pointer will now trap rather than adjusting ESP by the wrong amount. This also fixes the "can't have 32-bit 16GB stack frame" assertion, which used to be triggerable by user code but is now correct. To help catch situations like this in the future, I've added `-verify-machineinstrs` to the stack clash tests that generate large stack frames. This fixes the expensive-checks buildbot failure caused by #113219. --- llvm/lib/Target/X86/X86FrameLowering.cpp | 17 ++++++++++------- llvm/test/CodeGen/X86/stack-clash-extra-huge.ll | 10 +++++----- llvm/test/CodeGen/X86/stack-clash-huge.ll | 8 ++++---- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 18de38b2d01597..47cc6a18ef8433 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -253,17 +253,19 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; - unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); + unsigned Rax = (unsigned)(Uses64BitFramePtr ? 
X86::RAX : X86::EAX); if (isSub && !isEAXLiveIn(MBB)) Reg = Rax; else - Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); + Reg = getX86SubSuperRegister(TRI->findDeadCallerSavedReg(MBB, MBBI), + Uses64BitFramePtr ? 64 : 32); - unsigned AddSubRROpc = - isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); + unsigned AddSubRROpc = isSub ? getSUBrrOpcode(Uses64BitFramePtr) + : getADDrrOpcode(Uses64BitFramePtr); if (Reg) { - BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)), + Reg) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr) @@ -279,7 +281,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // addq %rsp, %rax // xchg %rax, (%rsp) // movq (%rsp), %rsp - assert(Is64Bit && "can't have 32-bit 16GB stack frame"); + assert(Uses64BitFramePtr && "can't have 32-bit 16GB stack frame"); BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) .addReg(Rax, RegState::Kill) .setMIFlag(Flag); @@ -289,7 +291,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, Offset = -(Offset - SlotSize); else Offset = Offset + SlotSize; - BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)), + Rax) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax) diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll index 59cbcd0689fbf8..b8031056fd6b0a 100644 --- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck 
-check-prefix=CHECK-X86 %s -; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s +; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s +; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s define i32 @foo() local_unnamed_addr #0 { ; CHECK-X64-LABEL: foo: @@ -66,8 +66,8 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X32-NEXT: movl $1, 264(%esp) ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax -; CHECK-X32-NEXT: movabsq $4799999880, %rcx # imm = 0x11E1A2F88 -; CHECK-X32-NEXT: addq %rcx, %esp +; CHECK-X32-NEXT: movl $4799999880, %ecx # imm = 0x11E1A2F88 +; CHECK-X32-NEXT: addl %ecx, %esp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 1200000000, align 16 diff --git a/llvm/test/CodeGen/X86/stack-clash-huge.ll b/llvm/test/CodeGen/X86/stack-clash-huge.ll index 03f028dfc25067..c9990773201f0b 100644 --- a/llvm/test/CodeGen/X86/stack-clash-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-huge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s -; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s +; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s +; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s define i32 @foo() local_unnamed_addr #0 { ; CHECK-X64-LABEL: foo: @@ -69,7 +69,7 @@ 
define i32 @foo() local_unnamed_addr #0 { ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax ; CHECK-X32-NEXT: movl $2399999880, %ecx # imm = 0x8F0D1788 -; CHECK-X32-NEXT: addq %rcx, %esp +; CHECK-X32-NEXT: addl %ecx, %esp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 600000000, align 16 From 8eb99bbe6e8878bfd73fb301899ced6bb5dfff38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 23 Jan 2025 09:15:47 +0200 Subject: [PATCH 099/208] Reland [LLD] [COFF] Fix linking MSVC generated implib header objects (#123916) ecb5ea6a266d5cc4e05252f6db4c73613b73cc3b tried to fix cases when LLD links what seems to be import library header objects from MSVC. However, the fix seems incorrect; the review at https://reviews.llvm.org/D133627 concluded that if this (treating this kind of symbol as a common symbol) is what link.exe does, it's fine. However, this is most probably not what link.exe does. The symbol mentioned in the commit message of ecb5ea6a266d5cc4e05252f6db4c73613b73cc3b would be a common symbol with a size of around 3 GB; this is not what might have been intended. That commit tried to avoid running into the error ".idata$4 should not refer to special section 0"; that issue is fixed for a similar style of section symbols in 4a4a8a1476b1386b523dc5b292ba9a5a6748a9cf. Therefore, revert ecb5ea6a266d5cc4e05252f6db4c73613b73cc3b and extend the fix from 4a4a8a1476b1386b523dc5b292ba9a5a6748a9cf to also work for the section symbols in MSVC generated import libraries. The main detail about them, is that for symbols of type IMAGE_SYM_CLASS_SECTION, the Value field is not an offset, but it is an optional set of flags, corresponding to the Characteristics of the section header (although it may be empty). This is a reland of a previous version of this commit, earlier merged in 9457418e66766d8fafc81f85eb8045986220ca3e / #122811. The previous version failed tests when run with address sanitizer. 
The issue was that the synthesized coff_symbol_generic object actually will be used to access a full coff_symbol16 or coff_symbol32 struct, see DefinedCOFF::getCOFFSymbol. Therefore, we need to make a copy of the full size of either of them. --- lld/COFF/InputFiles.cpp | 43 ++++++++++++++++++++++----- lld/test/COFF/empty-section-decl.yaml | 13 ++++---- llvm/include/llvm/Object/COFF.h | 7 ++--- llvm/test/Object/coff-sec-sym.test | 20 ------------- 4 files changed, 46 insertions(+), 37 deletions(-) delete mode 100644 llvm/test/Object/coff-sec-sym.test diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 5ee73d4dc4f8b7..fe1135db636cbc 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -105,6 +105,18 @@ static bool ignoredSymbolName(StringRef name) { return name == "@feat.00" || name == "@comp.id"; } +static coff_symbol_generic *cloneSymbol(COFFSymbolRef sym) { + if (sym.isBigObj()) { + auto *copy = make( + *reinterpret_cast(sym.getRawPtr())); + return reinterpret_cast(copy); + } else { + auto *copy = make( + *reinterpret_cast(sym.getRawPtr())); + return reinterpret_cast(copy); + } +} + ArchiveFile::ArchiveFile(COFFLinkerContext &ctx, MemoryBufferRef m) : InputFile(ctx.symtab, ArchiveKind, m) {} @@ -458,9 +470,16 @@ Symbol *ObjFile::createRegular(COFFSymbolRef sym) { return nullptr; return symtab.addUndefined(name, this, false); } - if (sc) + if (sc) { + const coff_symbol_generic *symGen = sym.getGeneric(); + if (sym.isSection()) { + auto *customSymGen = cloneSymbol(sym); + customSymGen->Value = 0; + symGen = customSymGen; + } return make(this, /*Name*/ "", /*IsCOMDAT*/ false, - /*IsExternal*/ false, sym.getGeneric(), sc); + /*IsExternal*/ false, symGen, sc); + } return nullptr; } @@ -755,15 +774,23 @@ std::optional ObjFile::createDefined( memset(hdr, 0, sizeof(*hdr)); strncpy(hdr->Name, name.data(), std::min(name.size(), (size_t)COFF::NameSize)); - // We have no idea what characteristics should be assumed here; pick - // a default. 
This matches what is used for .idata sections in the regular - // object files in import libraries. - hdr->Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | - IMAGE_SCN_MEM_WRITE | IMAGE_SCN_ALIGN_4BYTES; + // The Value field in a section symbol may contain the characteristics, + // or it may be zero, where we make something up (that matches what is + // used in .idata sections in the regular object files in import libraries). + if (sym.getValue()) + hdr->Characteristics = sym.getValue() | IMAGE_SCN_ALIGN_4BYTES; + else + hdr->Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | + IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE | + IMAGE_SCN_ALIGN_4BYTES; auto *sc = make(this, hdr); chunks.push_back(sc); + + auto *symGen = cloneSymbol(sym); + // Ignore the Value offset of these symbols, as it may be a bitmask. + symGen->Value = 0; return make(this, /*name=*/"", /*isCOMDAT=*/false, - /*isExternal=*/false, sym.getGeneric(), sc); + /*isExternal=*/false, symGen, sc); } if (llvm::COFF::isReservedSectionNumber(sectionNumber)) diff --git a/lld/test/COFF/empty-section-decl.yaml b/lld/test/COFF/empty-section-decl.yaml index 320df340000289..12fe6d44ebb832 100644 --- a/lld/test/COFF/empty-section-decl.yaml +++ b/lld/test/COFF/empty-section-decl.yaml @@ -6,7 +6,7 @@ # RUN: FileCheck %s --check-prefix=MAP < %t.map # CHECK: Contents of section .itest: -# CHECK-NEXT: 180001000 0c100080 01000000 00000000 01000000 +# CHECK-NEXT: 180001000 0c100000 0c100000 00000000 01000000 # MAP: 00001000 0000000a 4 {{.*}}:(.itest$2) # MAP: 00001000 00000000 0 .itest$2 @@ -28,7 +28,10 @@ sections: Relocations: - VirtualAddress: 0 SymbolName: '.itest$4' - Type: IMAGE_REL_AMD64_ADDR64 + Type: IMAGE_REL_AMD64_ADDR32NB + - VirtualAddress: 4 + SymbolName: '.itest$6' + Type: IMAGE_REL_AMD64_ADDR32NB - Name: '.itest$6' Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] Alignment: 2 @@ -42,13 +45,13 @@ symbols: ComplexType: IMAGE_SYM_DTYPE_NULL 
StorageClass: IMAGE_SYM_CLASS_SECTION - Name: '.itest$6' - Value: 0 + Value: 3221225536 SectionNumber: 2 SimpleType: IMAGE_SYM_TYPE_NULL ComplexType: IMAGE_SYM_DTYPE_NULL - StorageClass: IMAGE_SYM_CLASS_STATIC + StorageClass: IMAGE_SYM_CLASS_SECTION - Name: '.itest$4' - Value: 0 + Value: 3221225536 SectionNumber: 0 SimpleType: IMAGE_SYM_TYPE_NULL ComplexType: IMAGE_SYM_DTYPE_NULL diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index 4de2c680f57b1a..3d0738c4090497 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -383,8 +383,8 @@ class COFFSymbolRef { } bool isCommon() const { - return (isExternal() || isSection()) && - getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED && getValue() != 0; + return isExternal() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED && + getValue() != 0; } bool isUndefined() const { @@ -393,8 +393,7 @@ class COFFSymbolRef { } bool isEmptySectionDeclaration() const { - return isSection() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED && - getValue() == 0; + return isSection() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED; } bool isWeakExternal() const { diff --git a/llvm/test/Object/coff-sec-sym.test b/llvm/test/Object/coff-sec-sym.test deleted file mode 100644 index 0b7117250150de..00000000000000 --- a/llvm/test/Object/coff-sec-sym.test +++ /dev/null @@ -1,20 +0,0 @@ -# Check that section symbol (IMAGE_SYM_CLASS_SECTION) is listed as common symbol. - -# RUN: yaml2obj %s -o %t.obj -# RUN: llvm-nm %t.obj | FileCheck %s - -# CHECK: 00000001 C foo - ---- !COFF -header: - Machine: IMAGE_FILE_MACHINE_AMD64 - Characteristics: [ ] -sections: -symbols: - - Name: foo - Value: 1 - SectionNumber: 0 - SimpleType: IMAGE_SYM_TYPE_NULL - ComplexType: IMAGE_SYM_DTYPE_NULL - StorageClass: IMAGE_SYM_CLASS_SECTION -... 
From cd5694ecea2da1990365f46f9737be1b29d94f0c Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Wed, 22 Jan 2025 23:19:47 -0800 Subject: [PATCH 100/208] [StrTable] Switch the option parser to `llvm::StringTable` (#123308) Now that we have a dedicated abstraction for string tables, switch the option parser library's string table over to it rather than using a raw `const char*`. Also try to use the `StringTable::Offset` type rather than a raw `unsigned` where we can to avoid accidental increments or other issues. This is based on review feedback for the initial switch of options to a string table. Happy to tweak or adjust if desired here. --- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- .../Platform/MacOSX/PlatformDarwin.cpp | 3 +- llvm/include/llvm/Option/OptTable.h | 67 ++++++++++-------- llvm/lib/Option/OptTable.cpp | 70 ++++++++++--------- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 +- .../Option/OptionMarshallingTest.cpp | 3 +- llvm/utils/TableGen/OptionParserEmitter.cpp | 11 +-- 7 files changed, 88 insertions(+), 71 deletions(-) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 58658dedbaf1ee..3bf124e4827be9 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -282,7 +282,7 @@ using ArgumentConsumer = CompilerInvocation::ArgumentConsumer; #undef OPTTABLE_STR_TABLE_CODE static llvm::StringRef lookupStrInTable(unsigned Offset) { - return &OptionStrTable[Offset]; + return OptionStrTable[Offset]; } #define SIMPLE_ENUM_VALUE_TABLE diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 2a36f95c94d0ce..51e9a6d81b8390 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -42,6 +42,7 @@ #include "lldb/Utility/Status.h" #include "lldb/Utility/Timer.h" #include "llvm/ADT/STLExtras.h" +#include 
"llvm/ADT/StringTable.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Threading.h" @@ -1083,7 +1084,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( if (!version.empty() && sdk_type != XcodeSDK::Type::Linux && sdk_type != XcodeSDK::Type::XROS) { #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ - llvm::StringRef opt_##VAR = &OptionStrTable[NAME_OFFSET]; \ + llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; #include "clang/Driver/Options.inc" #undef OPTION diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h index 38a03fef7ae124..61a58aa304ecb4 100644 --- a/llvm/include/llvm/Option/OptTable.h +++ b/llvm/include/llvm/Option/OptTable.h @@ -12,6 +12,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringTable.h" #include "llvm/Option/OptSpecifier.h" #include "llvm/Support/StringSaver.h" #include @@ -54,7 +55,7 @@ class OptTable { /// Entry for a single option instance in the option data table. struct Info { unsigned PrefixesOffset; - unsigned PrefixedNameOffset; + StringTable::Offset PrefixedNameOffset; const char *HelpText; // Help text for specific visibilities. A list of pairs, where each pair // is a list of visibilities and a specific help string for those @@ -80,34 +81,37 @@ class OptTable { bool hasNoPrefix() const { return PrefixesOffset == 0; } - unsigned getNumPrefixes(ArrayRef PrefixesTable) const { - return PrefixesTable[PrefixesOffset]; + unsigned getNumPrefixes(ArrayRef PrefixesTable) const { + // We embed the number of prefixes in the value of the first offset. + return PrefixesTable[PrefixesOffset].value(); } - ArrayRef - getPrefixOffsets(ArrayRef PrefixesTable) const { - return hasNoPrefix() ? ArrayRef() + ArrayRef + getPrefixOffsets(ArrayRef PrefixesTable) const { + return hasNoPrefix() ? 
ArrayRef() : PrefixesTable.slice(PrefixesOffset + 1, getNumPrefixes(PrefixesTable)); } - void appendPrefixes(const char *StrTable, ArrayRef PrefixesTable, + void appendPrefixes(const StringTable &StrTable, + ArrayRef PrefixesTable, SmallVectorImpl &Prefixes) const { - for (unsigned PrefixOffset : getPrefixOffsets(PrefixesTable)) - Prefixes.push_back(&StrTable[PrefixOffset]); + for (auto PrefixOffset : getPrefixOffsets(PrefixesTable)) + Prefixes.push_back(StrTable[PrefixOffset]); } - StringRef getPrefix(const char *StrTable, ArrayRef PrefixesTable, + StringRef getPrefix(const StringTable &StrTable, + ArrayRef PrefixesTable, unsigned PrefixIndex) const { - return &StrTable[getPrefixOffsets(PrefixesTable)[PrefixIndex]]; + return StrTable[getPrefixOffsets(PrefixesTable)[PrefixIndex]]; } - StringRef getPrefixedName(const char *StrTable) const { - return &StrTable[PrefixedNameOffset]; + StringRef getPrefixedName(const StringTable &StrTable) const { + return StrTable[PrefixedNameOffset]; } - StringRef getName(const char *StrTable, - ArrayRef PrefixesTable) const { + StringRef getName(const StringTable &StrTable, + ArrayRef PrefixesTable) const { unsigned PrefixLength = hasNoPrefix() ? 0 : getPrefix(StrTable, PrefixesTable, 0).size(); return getPrefixedName(StrTable).drop_front(PrefixLength); @@ -117,13 +121,13 @@ class OptTable { private: // A unified string table for these options. Individual strings are stored as // null terminated C-strings at offsets within this table. - const char *StrTable; + const StringTable *StrTable; // A table of different sets of prefixes. Each set starts with the number of // prefixes in that set followed by that many offsets into the string table // for each of the prefix strings. This is essentially a Pascal-string style // encoding. - ArrayRef PrefixesTable; + ArrayRef PrefixesTable; /// The option information table. ArrayRef OptionInfos; @@ -161,7 +165,8 @@ class OptTable { protected: /// Initialize OptTable using Tablegen'ed OptionInfos. 
Child class must /// manually call \c buildPrefixChars once they are fully constructed. - OptTable(const char *StrTable, ArrayRef PrefixesTable, + OptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, bool IgnoreCase = false); /// Build (or rebuild) the PrefixChars member. @@ -171,10 +176,12 @@ class OptTable { virtual ~OptTable(); /// Return the string table used for option names. - const char *getStrTable() const { return StrTable; } + const StringTable &getStrTable() const { return *StrTable; } /// Return the prefixes table used for option names. - ArrayRef getPrefixesTable() const { return PrefixesTable; } + ArrayRef getPrefixesTable() const { + return PrefixesTable; + } /// Return the total number of option classes. unsigned getNumOptions() const { return OptionInfos.size(); } @@ -187,25 +194,25 @@ class OptTable { /// Lookup the name of the given option. StringRef getOptionName(OptSpecifier id) const { - return getInfo(id).getName(StrTable, PrefixesTable); + return getInfo(id).getName(*StrTable, PrefixesTable); } /// Lookup the prefix of the given option. StringRef getOptionPrefix(OptSpecifier id) const { const Info &I = getInfo(id); return I.hasNoPrefix() ? StringRef() - : I.getPrefix(StrTable, PrefixesTable, 0); + : I.getPrefix(*StrTable, PrefixesTable, 0); } void appendOptionPrefixes(OptSpecifier id, SmallVectorImpl &Prefixes) const { const Info &I = getInfo(id); - I.appendPrefixes(StrTable, PrefixesTable, Prefixes); + I.appendPrefixes(*StrTable, PrefixesTable, Prefixes); } /// Lookup the prefixed name of the given option. StringRef getOptionPrefixedName(OptSpecifier id) const { - return getInfo(id).getPrefixedName(StrTable); + return getInfo(id).getPrefixedName(*StrTable); } /// Get the kind of the given option. 
@@ -418,19 +425,21 @@ class OptTable { /// Specialization of OptTable class GenericOptTable : public OptTable { protected: - GenericOptTable(const char *StrTable, ArrayRef PrefixesTable, + GenericOptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, bool IgnoreCase = false); }; class PrecomputedOptTable : public OptTable { protected: - PrecomputedOptTable(const char *StrTable, ArrayRef PrefixesTable, + PrecomputedOptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, - ArrayRef PrefixesUnionOffsets, + ArrayRef PrefixesUnionOffsets, bool IgnoreCase = false) : OptTable(StrTable, PrefixesTable, OptionInfos, IgnoreCase) { - for (unsigned PrefixOffset : PrefixesUnionOffsets) - PrefixesUnion.push_back(&StrTable[PrefixOffset]); + for (auto PrefixOffset : PrefixesUnionOffsets) + PrefixesUnion.push_back(StrTable[PrefixOffset]); buildPrefixChars(); } }; diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 87e6f1f12364c2..6d10e6154147ec 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -33,11 +33,12 @@ using namespace llvm::opt; namespace { struct OptNameLess { - const char *StrTable; - ArrayRef PrefixesTable; + const StringTable *StrTable; + ArrayRef PrefixesTable; - explicit OptNameLess(const char *StrTable, ArrayRef PrefixesTable) - : StrTable(StrTable), PrefixesTable(PrefixesTable) {} + explicit OptNameLess(const StringTable &StrTable, + ArrayRef PrefixesTable) + : StrTable(&StrTable), PrefixesTable(PrefixesTable) {} #ifndef NDEBUG inline bool operator()(const OptTable::Info &A, @@ -45,13 +46,13 @@ struct OptNameLess { if (&A == &B) return false; - if (int Cmp = StrCmpOptionName(A.getName(StrTable, PrefixesTable), - B.getName(StrTable, PrefixesTable))) + if (int Cmp = StrCmpOptionName(A.getName(*StrTable, PrefixesTable), + B.getName(*StrTable, PrefixesTable))) return Cmp < 0; SmallVector APrefixes, BPrefixes; - A.appendPrefixes(StrTable, PrefixesTable, 
APrefixes); - B.appendPrefixes(StrTable, PrefixesTable, BPrefixes); + A.appendPrefixes(*StrTable, PrefixesTable, APrefixes); + B.appendPrefixes(*StrTable, PrefixesTable, BPrefixes); if (int Cmp = StrCmpOptionPrefixes(APrefixes, BPrefixes)) return Cmp < 0; @@ -68,7 +69,7 @@ struct OptNameLess { // Support lower_bound between info and an option name. inline bool operator()(const OptTable::Info &I, StringRef Name) const { // Do not fallback to case sensitive comparison. - return StrCmpOptionName(I.getName(StrTable, PrefixesTable), Name, false) < + return StrCmpOptionName(I.getName(*StrTable, PrefixesTable), Name, false) < 0; } }; @@ -76,9 +77,10 @@ struct OptNameLess { OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {} -OptTable::OptTable(const char *StrTable, ArrayRef PrefixesTable, +OptTable::OptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, bool IgnoreCase) - : StrTable(StrTable), PrefixesTable(PrefixesTable), + : StrTable(&StrTable), PrefixesTable(PrefixesTable), OptionInfos(OptionInfos), IgnoreCase(IgnoreCase) { // Explicitly zero initialize the error to work around a bug in array // value-initialization on MinGW with gcc 4.3.5. @@ -151,13 +153,13 @@ static bool isInput(const ArrayRef &Prefixes, StringRef Arg) { } /// \returns Matched size. 0 means no match. -static unsigned matchOption(const char *StrTable, - ArrayRef PrefixesTable, +static unsigned matchOption(const StringTable &StrTable, + ArrayRef PrefixesTable, const OptTable::Info *I, StringRef Str, bool IgnoreCase) { StringRef Name = I->getName(StrTable, PrefixesTable); - for (unsigned PrefixOffset : I->getPrefixOffsets(PrefixesTable)) { - StringRef Prefix = &StrTable[PrefixOffset]; + for (auto PrefixOffset : I->getPrefixOffsets(PrefixesTable)) { + StringRef Prefix = StrTable[PrefixOffset]; if (Str.starts_with(Prefix)) { StringRef Rest = Str.substr(Prefix.size()); bool Matched = IgnoreCase ? 
Rest.starts_with_insensitive(Name) @@ -170,13 +172,13 @@ static unsigned matchOption(const char *StrTable, } // Returns true if one of the Prefixes + In.Names matches Option -static bool optionMatches(const char *StrTable, - ArrayRef PrefixesTable, +static bool optionMatches(const StringTable &StrTable, + ArrayRef PrefixesTable, const OptTable::Info &In, StringRef Option) { StringRef Name = In.getName(StrTable, PrefixesTable); if (Option.consume_back(Name)) - for (unsigned PrefixOffset : In.getPrefixOffsets(PrefixesTable)) - if (Option == &StrTable[PrefixOffset]) + for (auto PrefixOffset : In.getPrefixOffsets(PrefixesTable)) + if (Option == StrTable[PrefixOffset]) return true; return false; } @@ -189,7 +191,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const { // Search all options and return possible values. for (size_t I = FirstSearchableIndex, E = OptionInfos.size(); I < E; I++) { const Info &In = OptionInfos[I]; - if (!In.Values || !optionMatches(StrTable, PrefixesTable, In, Option)) + if (!In.Values || !optionMatches(*StrTable, PrefixesTable, In, Option)) continue; SmallVector Candidates; @@ -217,9 +219,9 @@ OptTable::findByPrefix(StringRef Cur, Visibility VisibilityMask, if (In.Flags & DisableFlags) continue; - StringRef Name = In.getName(StrTable, PrefixesTable); - for (unsigned PrefixOffset : In.getPrefixOffsets(PrefixesTable)) { - StringRef Prefix = &StrTable[PrefixOffset]; + StringRef Name = In.getName(*StrTable, PrefixesTable); + for (auto PrefixOffset : In.getPrefixOffsets(PrefixesTable)) { + StringRef Prefix = (*StrTable)[PrefixOffset]; std::string S = (Twine(Prefix) + Name + "\t").str(); if (In.HelpText) S += In.HelpText; @@ -271,7 +273,7 @@ unsigned OptTable::internalFindNearest( for (const Info &CandidateInfo : ArrayRef(OptionInfos).drop_front(FirstSearchableIndex)) { - StringRef CandidateName = CandidateInfo.getName(StrTable, PrefixesTable); + StringRef CandidateName = CandidateInfo.getName(*StrTable, PrefixesTable); // We 
can eliminate some option prefix/name pairs as candidates right away: // * Ignore option candidates with empty names, such as "--", or names @@ -304,9 +306,9 @@ unsigned OptTable::internalFindNearest( // Consider each possible prefix for each candidate to find the most // appropriate one. For example, if a user asks for "--helm", suggest // "--help" over "-help". - for (unsigned CandidatePrefixOffset : + for (auto CandidatePrefixOffset : CandidateInfo.getPrefixOffsets(PrefixesTable)) { - StringRef CandidatePrefix = &StrTable[CandidatePrefixOffset]; + StringRef CandidatePrefix = (*StrTable)[CandidatePrefixOffset]; // If Candidate and NormalizedName have more than 'BestDistance' // characters of difference, no need to compute the edit distance, it's // going to be greater than BestDistance. Don't bother computing Candidate @@ -359,14 +361,14 @@ std::unique_ptr OptTable::parseOneArgGrouped(InputArgList &Args, StringRef Name = Str.ltrim(PrefixChars); const Info *Start = std::lower_bound(OptionInfos.data() + FirstSearchableIndex, End, Name, - OptNameLess(StrTable, PrefixesTable)); + OptNameLess(*StrTable, PrefixesTable)); const Info *Fallback = nullptr; unsigned Prev = Index; // Search for the option which matches Str. for (; Start != End; ++Start) { unsigned ArgSize = - matchOption(StrTable, PrefixesTable, Start, Str, IgnoreCase); + matchOption(*StrTable, PrefixesTable, Start, Str, IgnoreCase); if (!ArgSize) continue; @@ -449,7 +451,7 @@ std::unique_ptr OptTable::internalParseOneArg( // Search for the first next option which could be a prefix. Start = - std::lower_bound(Start, End, Name, OptNameLess(StrTable, PrefixesTable)); + std::lower_bound(Start, End, Name, OptNameLess(*StrTable, PrefixesTable)); // Options are stored in sorted order, with '\0' at the end of the // alphabet. Since the only options which can accept a string must @@ -464,7 +466,7 @@ std::unique_ptr OptTable::internalParseOneArg( // Scan for first option which is a proper prefix. 
for (; Start != End; ++Start) if ((ArgSize = - matchOption(StrTable, PrefixesTable, Start, Str, IgnoreCase))) + matchOption(*StrTable, PrefixesTable, Start, Str, IgnoreCase))) break; if (Start == End) break; @@ -787,15 +789,15 @@ void OptTable::internalPrintHelp( OS.flush(); } -GenericOptTable::GenericOptTable(const char *StrTable, - ArrayRef PrefixesTable, +GenericOptTable::GenericOptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, bool IgnoreCase) : OptTable(StrTable, PrefixesTable, OptionInfos, IgnoreCase) { std::set TmpPrefixesUnion; for (auto const &Info : OptionInfos.drop_front(FirstSearchableIndex)) - for (unsigned PrefixOffset : Info.getPrefixOffsets(PrefixesTable)) - TmpPrefixesUnion.insert(StringRef(&StrTable[PrefixOffset])); + for (auto PrefixOffset : Info.getPrefixOffsets(PrefixesTable)) + TmpPrefixesUnion.insert(StrTable[PrefixOffset]); PrefixesUnion.append(TmpPrefixesUnion.begin(), TmpPrefixesUnion.end()); buildPrefixChars(); } diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 93fed8ee8e6f42..99e0440dce78d5 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -94,7 +94,8 @@ namespace { class CommonOptTable : public opt::GenericOptTable { public: - CommonOptTable(const char *StrTable, ArrayRef PrefixesTable, + CommonOptTable(const StringTable &StrTable, + ArrayRef PrefixesTable, ArrayRef OptionInfos, const char *Usage, const char *Description) : opt::GenericOptTable(StrTable, PrefixesTable, OptionInfos), diff --git a/llvm/unittests/Option/OptionMarshallingTest.cpp b/llvm/unittests/Option/OptionMarshallingTest.cpp index 08c3b019689f8c..005144b91bf7f3 100644 --- a/llvm/unittests/Option/OptionMarshallingTest.cpp +++ b/llvm/unittests/Option/OptionMarshallingTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringRef.h" +#include 
"llvm/ADT/StringTable.h" #include "gtest/gtest.h" #define OPTTABLE_STR_TABLE_CODE @@ -20,7 +21,7 @@ struct OptionWithMarshallingInfo { const char *ImpliedValue; llvm::StringRef getPrefixedName() const { - return &OptionStrTable[PrefixedNameOffset]; + return OptionStrTable[PrefixedNameOffset]; } }; diff --git a/llvm/utils/TableGen/OptionParserEmitter.cpp b/llvm/utils/TableGen/OptionParserEmitter.cpp index 8b92d252392194..35a452890b0ec7 100644 --- a/llvm/utils/TableGen/OptionParserEmitter.cpp +++ b/llvm/utils/TableGen/OptionParserEmitter.cpp @@ -303,15 +303,17 @@ static void emitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { OS << "/////////\n"; OS << "// String table\n\n"; OS << "#ifdef OPTTABLE_STR_TABLE_CODE\n"; - Table.EmitStringLiteralDef(OS, "static constexpr char OptionStrTable[]", - /*Indent=*/""); + Table.EmitStringLiteralDef( + OS, "static constexpr llvm::StringTable OptionStrTable", + /*Indent=*/""); OS << "#endif // OPTTABLE_STR_TABLE_CODE\n\n"; // Dump prefixes. OS << "/////////\n"; OS << "// Prefixes\n\n"; OS << "#ifdef OPTTABLE_PREFIXES_TABLE_CODE\n"; - OS << "static constexpr unsigned OptionPrefixesTable[] = {\n"; + OS << "static constexpr llvm::StringTable::Offset OptionPrefixesTable[] = " + "{\n"; { // Ensure the first prefix set is always empty. 
assert(!Prefixes.empty() && @@ -339,7 +341,8 @@ static void emitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { OS << "/////////\n"; OS << "// Prefix Union\n\n"; OS << "#ifdef OPTTABLE_PREFIXES_UNION_CODE\n"; - OS << "static constexpr unsigned OptionPrefixesUnion[] = {\n"; + OS << "static constexpr llvm::StringTable::Offset OptionPrefixesUnion[] = " + "{\n"; { llvm::ListSeparator Sep(", "); for (auto Prefix : PrefixesUnion) From 2b67eceeef6e04ae5a4093bec9a0f0b048c70958 Mon Sep 17 00:00:00 2001 From: AdityaK Date: Wed, 22 Jan 2025 23:29:35 -0800 Subject: [PATCH 101/208] Android no longer supports arm < 7 (#123952) --- clang/lib/Driver/ToolChains/Arch/ARM.cpp | 3 +-- clang/test/Driver/arm-mfpu.c | 12 ------------ clang/test/Driver/linux-as.c | 5 ----- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index f9d8aacaad234d..3aee540d501bef 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -443,14 +443,13 @@ arm::FloatABI arm::getDefaultFloatABI(const llvm::Triple &Triple) { case llvm::Triple::MuslEABIHF: case llvm::Triple::EABIHF: return FloatABI::Hard; + case llvm::Triple::Android: case llvm::Triple::GNUEABI: case llvm::Triple::GNUEABIT64: case llvm::Triple::MuslEABI: case llvm::Triple::EABI: // EABI is always AAPCS, and if it was not marked 'hard', it's softfp return FloatABI::SoftFP; - case llvm::Triple::Android: - return (SubArch >= 7) ? 
FloatABI::SoftFP : FloatABI::Soft; default: return FloatABI::Invalid; } diff --git a/clang/test/Driver/arm-mfpu.c b/clang/test/Driver/arm-mfpu.c index 5ea2230044dfbb..640e1b35c84b89 100644 --- a/clang/test/Driver/arm-mfpu.c +++ b/clang/test/Driver/arm-mfpu.c @@ -388,18 +388,6 @@ // CHECK-SOFT-ABI-FP-DAG: "-target-feature" "-aes" // CHECK-SOFT-ABI-FP-DAG: "-target-feature" "-fpregs" -// RUN: %clang -target arm-linux-androideabi21 %s -### -c 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ARM5-ANDROID-FP-DEFAULT %s -// CHECK-ARM5-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+soft-float" -// CHECK-ARM5-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+soft-float-abi" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+d32" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+vfp3" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+vfp4" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+fp-armv8" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+neon" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+sha2" -// CHECK-ARM5-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+aes" - // RUN: %clang -target armv7-linux-androideabi21 %s -### -c 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-ARM7-ANDROID-FP-DEFAULT %s // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+soft-float" diff --git a/clang/test/Driver/linux-as.c b/clang/test/Driver/linux-as.c index fb6de85ba105b0..2b394b7cd3d412 100644 --- a/clang/test/Driver/linux-as.c +++ b/clang/test/Driver/linux-as.c @@ -106,11 +106,6 @@ // RUN: | FileCheck -check-prefix=CHECK-ARM-MFLOAT-ABI %s // CHECK-ARM-MFLOAT-ABI: as{{(.exe)?}}" "-EL" "-mfloat-abi=hard" // -// RUN: %clang -target arm-linux-androideabi -### \ -// RUN: -no-integrated-as -c %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHECK-ARM-ANDROID %s -// CHECK-ARM-ANDROID: as{{(.exe)?}}" "-EL" "-mfloat-abi=soft" -// // RUN: %clang -target arm-linux-androideabi -march=armv7-a -### \ // RUN: -no-integrated-as -c %s 2>&1 \ // RUN: | 
FileCheck -check-prefix=CHECK-ARM-ANDROID-SOFTFP %s From 2a51a0d39a659feeeee57b6d1d768bf08d378c5e Mon Sep 17 00:00:00 2001 From: AdityaK Date: Wed, 22 Jan 2025 23:30:04 -0800 Subject: [PATCH 102/208] Remove reference to android-mips (#124021) --- clang/lib/Driver/ToolChains/Linux.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index c2a85be8198169..0767fe6c58796d 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -180,15 +180,6 @@ std::string Linux::getMultiarchTriple(const Driver &D, static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) { if (Triple.isMIPS()) { - if (Triple.isAndroid()) { - StringRef CPUName; - StringRef ABIName; - tools::mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName); - if (CPUName == "mips32r6") - return "libr6"; - if (CPUName == "mips32r2") - return "libr2"; - } // lib32 directory has a special meaning on MIPS targets. // It contains N32 ABI binaries. Use this folder if produce // code for N32 ABI only. From 091741a880c2df9d3d161068a12655d289633eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Thu, 23 Jan 2025 09:19:21 +0100 Subject: [PATCH 103/208] [libfuzzer] Clarify -max_len behavior on bigger files (#123095) --- compiler-rt/lib/fuzzer/FuzzerFlags.def | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerFlags.def b/compiler-rt/lib/fuzzer/FuzzerFlags.def index fc3b3aa8c98ad5..b88458a497404c 100644 --- a/compiler-rt/lib/fuzzer/FuzzerFlags.def +++ b/compiler-rt/lib/fuzzer/FuzzerFlags.def @@ -14,8 +14,9 @@ FUZZER_FLAG_UNSIGNED(seed, 0, "Random seed. If 0, seed is generated.") FUZZER_FLAG_INT(runs, -1, "Number of individual test runs (-1 for infinite runs).") FUZZER_FLAG_INT(max_len, 0, "Maximum length of the test input. " - "If 0, libFuzzer tries to guess a good value based on the corpus " - "and reports it. 
") + "Contents of corpus files are going to be truncated to this value. " + "If 0, libFuzzer tries to guess a good value based on the corpus " + "and reports it.") FUZZER_FLAG_INT(len_control, 100, "Try generating small inputs first, " "then try larger inputs over time. Specifies the rate at which the length " "limit is increased (smaller == faster). If 0, immediately try inputs with " From 70d7c847fd1b73c8bb453eac11a4a1ae03bb0d86 Mon Sep 17 00:00:00 2001 From: Hongren Zheng Date: Thu, 23 Jan 2025 16:37:46 +0800 Subject: [PATCH 104/208] [mlir][docs] Add usage/example of OpAsmOpInterface (#123610) This is part of https://discourse.llvm.org/t/rfc-introduce-opasm-type-attr-interface-for-pretty-print-in-asmprinter/83792. OpAsmOpInterface controls the SSA Name/Block Name and Default Dialect Prefix. This PR adds the usage of them by existing examples in MLIR. --- mlir/docs/DefiningDialects/Assembly.md | 77 +++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/mlir/docs/DefiningDialects/Assembly.md b/mlir/docs/DefiningDialects/Assembly.md index d69349390ee3ec..aa6272e873587d 100644 --- a/mlir/docs/DefiningDialects/Assembly.md +++ b/mlir/docs/DefiningDialects/Assembly.md @@ -48,4 +48,79 @@ void MyDialect::initialize() { ``` * If `getAlias` provides an alias with a trailing digit, `AsmPrinter` appends an underscore to avoid conflicts with autogenerated IDs. -* If multiple types/attributes have the same alias from `getAlias`, a number is appended to the alias to avoid conflicts. \ No newline at end of file +* If multiple types/attributes have the same alias from `getAlias`, a number is appended to the alias to avoid conflicts. + +## Suggesting SSA/Block Names + +An `Operation` can suggest the SSA name prefix using `OpAsmOpInterface`. + +For example, `arith.constant` will suggest a name like `%c42_i32` for its result: + +```tablegen +include "mlir/IR/OpAsmInterface.td" + +def Arith_ConstantOp : Op]> { +... 
+} +``` + +And the corresponding method: + +```cpp +// from https://github.com/llvm/llvm-project/blob/5ce271ef74dd3325993c827f496e460ced41af11/mlir/lib/Dialect/Arith/IR/ArithOps.cpp#L184 +void arith::ConstantOp::getAsmResultNames( + function_ref setNameFn) { + auto type = getType(); + if (auto intCst = llvm::dyn_cast(getValue())) { + auto intType = llvm::dyn_cast(type); + + // Sugar i1 constants with 'true' and 'false'. + if (intType && intType.getWidth() == 1) + return setNameFn(getResult(), (intCst.getInt() ? "true" : "false")); + + // Otherwise, build a complex name with the value and type. + SmallString<32> specialNameBuffer; + llvm::raw_svector_ostream specialName(specialNameBuffer); + specialName << 'c' << intCst.getValue(); + if (intType) + specialName << '_' << type; + setNameFn(getResult(), specialName.str()); + } else { + setNameFn(getResult(), "cst"); + } +} +``` + +Similarly, an `Operation` can suggest the name for its block arguments using `getAsmBlockArgumentNames` method in `OpAsmOpInterface`. + +For custom block names, `OpAsmOpInterface` has a method `getAsmBlockNames` so that +the operation can suggest a custom prefix instead of a generic `^bb0`. + +## Defining Default Dialect + +An `Operation` can indicate that the nested region in it has a default dialect prefix, and the operations in the region could elide the dialect prefix. + +For example, in a `func.func` op all `func` prefix could be omitted: + +```tablegen +include "mlir/IR/OpAsmInterface.td" + +def FuncOp : Func_Op<"func", [ + OpAsmOpInterface + ... +]> { + let extraClassDeclaration = [{ + /// Allow the dialect prefix to be omitted. 
+ static StringRef getDefaultDialect() { return "func"; } + }]; +} +``` + +```mlir +func.func @main() { + // actually func.call + call @another() +} +``` From 4b0df28a68a4ed4ec5829fb4d8722a0e701d1796 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 23 Jan 2025 10:06:19 +0100 Subject: [PATCH 105/208] [clang][Tooling] Prefer for atomic_* family in C++ --- .../Inclusions/Stdlib/StdSpecialSymbolMap.inc | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSpecialSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSpecialSymbolMap.inc index 9179217dd6ca8b..0a332c99dd49fc 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StdSpecialSymbolMap.inc +++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSpecialSymbolMap.inc @@ -284,100 +284,148 @@ SYMBOL(abs, None, ) SYMBOL(atomic, std::, ) // atomic_* family symbols. is for C compatibility. SYMBOL(atomic_bool, std::, ) +SYMBOL(atomic_bool, None, ) SYMBOL(atomic_bool, None, ) SYMBOL(atomic_char, std::, ) +SYMBOL(atomic_char, None, ) SYMBOL(atomic_char, None, ) SYMBOL(atomic_char16_t, std::, ) +SYMBOL(atomic_char16_t, None, ) SYMBOL(atomic_char16_t, None, ) SYMBOL(atomic_char32_t, std::, ) +SYMBOL(atomic_char32_t, None, ) SYMBOL(atomic_char32_t, None, ) SYMBOL(atomic_char8_t, std::, ) +SYMBOL(atomic_char8_t, None, ) SYMBOL(atomic_char8_t, None, ) SYMBOL(atomic_int, std::, ) +SYMBOL(atomic_int, None, ) SYMBOL(atomic_int, None, ) SYMBOL(atomic_int16_t, std::, ) +SYMBOL(atomic_int16_t, None, ) SYMBOL(atomic_int16_t, None, ) SYMBOL(atomic_int32_t, std::, ) +SYMBOL(atomic_int32_t, None, ) SYMBOL(atomic_int32_t, None, ) SYMBOL(atomic_int64_t, std::, ) +SYMBOL(atomic_int64_t, None, ) SYMBOL(atomic_int64_t, None, ) SYMBOL(atomic_int8_t, std::, ) +SYMBOL(atomic_int8_t, None, ) SYMBOL(atomic_int8_t, None, ) SYMBOL(atomic_int_fast16_t, std::, ) +SYMBOL(atomic_int_fast16_t, None, ) SYMBOL(atomic_int_fast16_t, None, ) SYMBOL(atomic_int_fast32_t, std::, ) +SYMBOL(atomic_int_fast32_t, 
None, ) SYMBOL(atomic_int_fast32_t, None, ) SYMBOL(atomic_int_fast64_t, std::, ) +SYMBOL(atomic_int_fast64_t, None, ) SYMBOL(atomic_int_fast64_t, None, ) SYMBOL(atomic_int_fast8_t, std::, ) +SYMBOL(atomic_int_fast8_t, None, ) SYMBOL(atomic_int_fast8_t, None, ) SYMBOL(atomic_int_least16_t, std::, ) +SYMBOL(atomic_int_least16_t, None, ) SYMBOL(atomic_int_least16_t, None, ) SYMBOL(atomic_int_least32_t, std::, ) +SYMBOL(atomic_int_least32_t, None, ) SYMBOL(atomic_int_least32_t, None, ) SYMBOL(atomic_int_least64_t, std::, ) +SYMBOL(atomic_int_least64_t, None, ) SYMBOL(atomic_int_least64_t, None, ) SYMBOL(atomic_int_least8_t, std::, ) +SYMBOL(atomic_int_least8_t, None, ) SYMBOL(atomic_int_least8_t, None, ) SYMBOL(atomic_intmax_t, std::, ) +SYMBOL(atomic_intmax_t, None, ) SYMBOL(atomic_intmax_t, None, ) SYMBOL(atomic_intptr_t, std::, ) +SYMBOL(atomic_intptr_t, None, ) SYMBOL(atomic_intptr_t, None, ) SYMBOL(atomic_llong, std::, ) +SYMBOL(atomic_llong, None, ) SYMBOL(atomic_llong, None, ) SYMBOL(atomic_long, std::, ) +SYMBOL(atomic_long, None, ) SYMBOL(atomic_long, None, ) SYMBOL(atomic_ptrdiff_t, std::, ) +SYMBOL(atomic_ptrdiff_t, None, ) SYMBOL(atomic_ptrdiff_t, None, ) SYMBOL(atomic_schar, std::, ) +SYMBOL(atomic_schar, None, ) SYMBOL(atomic_schar, None, ) SYMBOL(atomic_short, std::, ) +SYMBOL(atomic_short, None, ) SYMBOL(atomic_short, None, ) SYMBOL(atomic_signed_lock_free, std::, ) +SYMBOL(atomic_signed_lock_free, None, ) SYMBOL(atomic_signed_lock_free, None, ) SYMBOL(atomic_size_t, std::, ) +SYMBOL(atomic_size_t, None, ) SYMBOL(atomic_size_t, None, ) SYMBOL(atomic_uchar, std::, ) +SYMBOL(atomic_uchar, None, ) SYMBOL(atomic_uchar, None, ) SYMBOL(atomic_uint, std::, ) +SYMBOL(atomic_uint, None, ) SYMBOL(atomic_uint, None, ) SYMBOL(atomic_uint16_t, std::, ) +SYMBOL(atomic_uint16_t, None, ) SYMBOL(atomic_uint16_t, None, ) SYMBOL(atomic_uint32_t, std::, ) +SYMBOL(atomic_uint32_t, None, ) SYMBOL(atomic_uint32_t, None, ) SYMBOL(atomic_uint64_t, std::, ) 
+SYMBOL(atomic_uint64_t, None, ) SYMBOL(atomic_uint64_t, None, ) SYMBOL(atomic_uint8_t, std::, ) +SYMBOL(atomic_uint8_t, None, ) SYMBOL(atomic_uint8_t, None, ) SYMBOL(atomic_uint_fast16_t, std::, ) +SYMBOL(atomic_uint_fast16_t, None, ) SYMBOL(atomic_uint_fast16_t, None, ) SYMBOL(atomic_uint_fast32_t, std::, ) +SYMBOL(atomic_uint_fast32_t, None, ) SYMBOL(atomic_uint_fast32_t, None, ) SYMBOL(atomic_uint_fast64_t, std::, ) +SYMBOL(atomic_uint_fast64_t, None, ) SYMBOL(atomic_uint_fast64_t, None, ) SYMBOL(atomic_uint_fast8_t, std::, ) +SYMBOL(atomic_uint_fast8_t, None, ) SYMBOL(atomic_uint_fast8_t, None, ) SYMBOL(atomic_uint_least16_t, std::, ) +SYMBOL(atomic_uint_least16_t, None, ) SYMBOL(atomic_uint_least16_t, None, ) SYMBOL(atomic_uint_least32_t, std::, ) +SYMBOL(atomic_uint_least32_t, None, ) SYMBOL(atomic_uint_least32_t, None, ) SYMBOL(atomic_uint_least64_t, std::, ) +SYMBOL(atomic_uint_least64_t, None, ) SYMBOL(atomic_uint_least64_t, None, ) SYMBOL(atomic_uint_least8_t, std::, ) +SYMBOL(atomic_uint_least8_t, None, ) SYMBOL(atomic_uint_least8_t, None, ) SYMBOL(atomic_uintmax_t, std::, ) +SYMBOL(atomic_uintmax_t, None, ) SYMBOL(atomic_uintmax_t, None, ) SYMBOL(atomic_uintptr_t, std::, ) +SYMBOL(atomic_uintptr_t, None, ) SYMBOL(atomic_uintptr_t, None, ) SYMBOL(atomic_ullong, std::, ) +SYMBOL(atomic_ullong, None, ) SYMBOL(atomic_ullong, None, ) SYMBOL(atomic_ulong, std::, ) +SYMBOL(atomic_ulong, None, ) SYMBOL(atomic_ulong, None, ) SYMBOL(atomic_unsigned_lock_free, std::, ) +SYMBOL(atomic_unsigned_lock_free, None, ) SYMBOL(atomic_unsigned_lock_free, None, ) SYMBOL(atomic_ushort, std::, ) +SYMBOL(atomic_ushort, None, ) SYMBOL(atomic_ushort, None, ) SYMBOL(atomic_wchar_t, std::, ) +SYMBOL(atomic_wchar_t, None, ) SYMBOL(atomic_wchar_t, None, ) // std::get has a few variants for different types (tuple, array, pair etc) From 778138114e9e42e28fcb51c0a38224e667a3790c Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 23 Jan 2025 09:16:09 +0000 Subject: [PATCH 
106/208] [SDAG] Use BatchAAResults for querying alias analysis (AA) results (#123934) Once we get to SelectionDAG the IR should not be changing anymore, so we can use BatchAAResults rather than AAResults to cache AA queries. This should be a NFC change for targets that enable AA during codegen (such as AArch64), but also give a nice compile-time improvement in some cases. See: https://github.com/llvm/llvm-project/pull/123787#issuecomment-2606797041 Note: This follows Nikita's suggestion on #123787. --- llvm/include/llvm/Analysis/AliasAnalysis.h | 6 +++++ llvm/include/llvm/CodeGen/MachineInstr.h | 3 +++ llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 10 ++++++- llvm/include/llvm/CodeGen/SelectionDAG.h | 21 ++++++++------- llvm/include/llvm/CodeGen/SelectionDAGISel.h | 10 ++++++- llvm/lib/CodeGen/MachineInstr.cpp | 16 +++++++++--- llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 5 ++-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 ++++++------ .../CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +- .../SelectionDAG/ScheduleDAGRRList.cpp | 2 +- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 2 +- .../CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 2 +- .../CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 14 ++++------ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 26 +++++++++---------- .../SelectionDAG/SelectionDAGBuilder.cpp | 26 +++++++++---------- .../SelectionDAG/SelectionDAGBuilder.h | 4 +-- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 18 ++++++------- .../Target/SystemZ/SystemZISelDAGToDAG.cpp | 4 +-- 18 files changed, 112 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index acc580f92b40a3..b192a9f5e65e7f 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -643,6 +643,9 @@ class BatchAAResults { bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal = false) { return isNoModRef(AA.getModRefInfoMask(Loc, AAQI, OrLocal)); } + bool 
pointsToConstantMemory(const Value *P, bool OrLocal = false) { + return pointsToConstantMemory(MemoryLocation::getBeforeOrAfter(P), OrLocal); + } ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, bool IgnoreLocals = false) { return AA.getModRefInfoMask(Loc, AAQI, IgnoreLocals); @@ -668,6 +671,9 @@ class BatchAAResults { MemoryLocation(V2, LocationSize::precise(1))) == AliasResult::MustAlias; } + bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) { + return alias(LocA, LocB) == AliasResult::NoAlias; + } ModRefInfo callCapturesBefore(const Instruction *I, const MemoryLocation &MemLoc, DominatorTree *DT) { diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index efac83d9e1c92c..109aac44b86623 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -42,6 +42,7 @@ class DILabel; class Instruction; class MDNode; class AAResults; +class BatchAAResults; template class ArrayRef; class DIExpression; class DILocalVariable; @@ -1753,6 +1754,8 @@ class MachineInstr /// @param AA Optional alias analysis, used to compare memory operands. /// @param Other MachineInstr to check aliasing against. /// @param UseTBAA Whether to pass TBAA information to alias analysis. 
+ bool mayAlias(BatchAAResults *AA, const MachineInstr &Other, + bool UseTBAA) const; bool mayAlias(AAResults *AA, const MachineInstr &Other, bool UseTBAA) const; /// Return true if this instruction may have an ordered diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 822b06f080fa64..aaa10e684687c6 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseMultiSet.h" #include "llvm/ADT/identity.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -169,7 +170,7 @@ namespace llvm { /// Tracks the last instructions in this region using each virtual register. VReg2SUnitOperIdxMultiMap CurrentVRegUses; - AAResults *AAForDep = nullptr; + mutable std::optional AAForDep; /// Remember a generic side-effecting instruction as we proceed. /// No other SU ever gets scheduled around it (except in the special @@ -201,6 +202,13 @@ namespace llvm { /// a means of remembering which SUs depend on which memory locations. class Value2SUsMap; + /// Returns a (possibly null) pointer to the current BatchAAResults. + BatchAAResults *getAAForDep() const { + if (AAForDep.has_value()) + return &AAForDep.value(); + return nullptr; + } + /// Reduces maps in FIFO order, by N SUs. 
This is better than turning /// every Nth memory SU into BarrierChain in buildSchedGraph(), since /// it avoids unnecessary edges between seen SUs above the new BarrierChain, diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index ba0538f7084eec..461c0c1ead16d2 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -61,7 +61,7 @@ class Type; template struct GraphTraits; template class SmallSetVector; template struct FoldingSetTrait; -class AAResults; +class BatchAAResults; class BlockAddress; class BlockFrequencyInfo; class Constant; @@ -602,7 +602,8 @@ class SelectionDAG { /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and /// types that are illegal on the target. - void Combine(CombineLevel Level, AAResults *AA, CodeGenOptLevel OptLevel); + void Combine(CombineLevel Level, BatchAAResults *BatchAA, + CodeGenOptLevel OptLevel); /// This transforms the SelectionDAG into a SelectionDAG that /// only uses types natively supported by the target. @@ -1202,12 +1203,14 @@ class SelectionDAG { /* \p CI if not null is the memset call being lowered. * \p OverrideTailCall is an optional parameter that can be used to override * the tail call optimization decision. 
*/ - SDValue - getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, - const CallInst *CI, std::optional OverrideTailCall, - MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, - const AAMDNodes &AAInfo = AAMDNodes(), AAResults *AA = nullptr); + SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, bool isVol, + bool AlwaysInline, const CallInst *CI, + std::optional OverrideTailCall, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo, + const AAMDNodes &AAInfo = AAMDNodes(), + BatchAAResults *BatchAA = nullptr); /* \p CI if not null is the memset call being lowered. * \p OverrideTailCall is an optional parameter that can be used to override @@ -1218,7 +1221,7 @@ class SelectionDAG { MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo = AAMDNodes(), - AAResults *AA = nullptr); + BatchAAResults *BatchAA = nullptr); SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 43ba8f4c44cf9c..e9452a6dc62339 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_SELECTIONDAGISEL_H #define LLVM_CODEGEN_SELECTIONDAGISEL_H +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -52,7 +53,7 @@ class SelectionDAGISel { MachineRegisterInfo *RegInfo; SelectionDAG *CurDAG; std::unique_ptr SDB; - AAResults *AA = nullptr; + mutable std::optional BatchAA; AssumptionCache *AC = nullptr; GCFunctionInfo *GFI = nullptr; SSPLayoutInfo *SP = nullptr; @@ -81,6 +82,13 @@ class SelectionDAGISel { CodeGenOptLevel OL = 
CodeGenOptLevel::Default); virtual ~SelectionDAGISel(); + /// Returns a (possibly null) pointer to the current BatchAAResults. + BatchAAResults *getBatchAA() const { + if (BatchAA.has_value()) + return &BatchAA.value(); + return nullptr; + } + const TargetLowering *getTargetLowering() const { return TLI; } void initializeAnalysisResults(MachineFunctionAnalysisManager &MFAM); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index ef36dfc4721975..8c2fab18a24ca0 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1350,8 +1350,9 @@ bool MachineInstr::wouldBeTriviallyDead() const { return isPHI() || isSafeToMove(SawStore); } -static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA, - bool UseTBAA, const MachineMemOperand *MMOa, +static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, + BatchAAResults *AA, bool UseTBAA, + const MachineMemOperand *MMOa, const MachineMemOperand *MMOb) { // The following interface to AA is fashioned after DAGCombiner::isAlias and // operates with MachineMemOperand offset with some important assumptions: @@ -1434,7 +1435,7 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA, MemoryLocation(ValB, LocB, UseTBAA ? 
MMOb->getAAInfo() : AAMDNodes())); } -bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, +bool MachineInstr::mayAlias(BatchAAResults *AA, const MachineInstr &Other, bool UseTBAA) const { const MachineFunction *MF = getMF(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -1478,6 +1479,15 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, return false; } +bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, + bool UseTBAA) const { + if (AA) { + BatchAAResults BAA(*AA); + return mayAlias(&BAA, Other, UseTBAA); + } + return mayAlias(static_cast(nullptr), Other, UseTBAA); +} + /// hasOrderedMemoryRef - Return true if this instruction may have an ordered /// or volatile memory reference, or if the information describing the memory /// reference is not available. Return false if it is known to have no ordered diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 8e3e06bf57153e..cc98c52e90ea68 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -551,7 +551,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb, unsigned Latency) { - if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) { + if (SUa->getInstr()->mayAlias(getAAForDep(), *SUb->getInstr(), UseTBAA)) { SDep Dep(SUa, SDep::MayAliasMem); Dep.setLatency(Latency); SUb->addPred(Dep); @@ -740,7 +740,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA, const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); - AAForDep = UseAA ? 
AA : nullptr; + if (UseAA && AA) + AAForDep.emplace(*AA); BarrierChain = nullptr; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 21d5e0a1b2953d..a0c703d2df8a2f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -191,8 +191,8 @@ namespace { /// candidate again. DenseMap> StoreRootCountMap; - // AA - Used for DAG load/store alias analysis. - AliasAnalysis *AA; + // BatchAA - Used for DAG load/store alias analysis. + BatchAAResults *BatchAA; /// This caches all chains that have already been processed in /// DAGCombiner::getStoreMergeCandidates() and found to have no mergeable @@ -247,9 +247,10 @@ namespace { SDValue visit(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOptLevel OL) + DAGCombiner(SelectionDAG &D, BatchAAResults *BatchAA, CodeGenOptLevel OL) : DAG(D), TLI(D.getTargetLoweringInfo()), - STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL), AA(AA) { + STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL), + BatchAA(BatchAA) { ForCodeSize = DAG.shouldOptForSize(); DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel); @@ -28918,7 +28919,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { UseAA = false; #endif - if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && + if (UseAA && BatchAA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0.hasValue() && Size1.hasValue() && // Can't represent a scalable size + fixed offset in LocationSize (!Size0.isScalable() || SrcValOffset0 == 0) && @@ -28933,7 +28934,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { Size0.isScalable() ? Size0 : LocationSize::precise(Overlap0); LocationSize Loc1 = Size1.isScalable() ? Size1 : LocationSize::precise(Overlap1); - if (AA->isNoAlias( + if (BatchAA->isNoAlias( MemoryLocation(MUC0.MMO->getValue(), Loc0, UseTBAA ? 
MUC0.MMO->getAAInfo() : AAMDNodes()), MemoryLocation(MUC1.MMO->getValue(), Loc1, @@ -29239,8 +29240,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { } /// This is the entry point for the file. -void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, +void SelectionDAG::Combine(CombineLevel Level, BatchAAResults *BatchAA, CodeGenOptLevel OptLevel) { /// This is the main entry point to this class. - DAGCombiner(*this, AA, OptLevel).Run(Level); + DAGCombiner(*this, BatchAA, OptLevel).Run(Level); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 26eba4b257fb9c..fd4641ec6f124e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -118,7 +118,7 @@ void ScheduleDAGFast::Schedule() { LiveRegCycles.resize(TRI->getNumRegs(), 0); // Build the scheduling graph. - BuildSchedGraph(nullptr); + BuildSchedGraph(); LLVM_DEBUG(dump()); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 51ee3cc681f05b..436c42f7e18fa9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -370,7 +370,7 @@ void ScheduleDAGRRList::Schedule() { assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); // Build the scheduling graph. - BuildSchedGraph(nullptr); + BuildSchedGraph(); LLVM_DEBUG(dump()); Topo.MarkDirty(); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index ac6c44ec635451..d04bd6e98097ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -536,7 +536,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { /// are input. 
This SUnit graph is similar to the SelectionDAG, but /// excludes nodes that aren't interesting to scheduling, and represents /// glued together nodes with a single SUnit. -void ScheduleDAGSDNodes::BuildSchedGraph(AAResults *AA) { +void ScheduleDAGSDNodes::BuildSchedGraph() { // Cluster certain nodes which should be scheduled together. ClusterNodes(); // Populate the SUnits array. diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index b7d25c6ccc9b06..ff5615b7658f37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -94,7 +94,7 @@ class InstrItineraryData; /// are input. This SUnit graph is similar to the SelectionDAG, but /// excludes nodes that aren't interesting to scheduling, and represents /// flagged together nodes with a single SUnit. - void BuildSchedGraph(AAResults *AA); + void BuildSchedGraph(); /// InitNumRegDefsLeft - Determine the # of regs defined by this node. /// diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index ae42a870ea2fe9..def0f9589f3f37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -59,14 +59,10 @@ class ScheduleDAGVLIW : public ScheduleDAGSDNodes { /// HazardRec - The hazard recognizer to use. ScheduleHazardRecognizer *HazardRec; - /// AA - AAResults for making memory reference queries. 
- AAResults *AA; - public: - ScheduleDAGVLIW(MachineFunction &mf, AAResults *aa, - SchedulingPriorityQueue *availqueue) - : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) { - const TargetSubtargetInfo &STI = mf.getSubtarget(); + ScheduleDAGVLIW(MachineFunction &MF, SchedulingPriorityQueue *AvailableQueue) + : ScheduleDAGSDNodes(MF), AvailableQueue(AvailableQueue) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this); } @@ -91,7 +87,7 @@ void ScheduleDAGVLIW::Schedule() { << " '" << BB->getName() << "' **********\n"); // Build the scheduling graph. - BuildSchedGraph(AA); + BuildSchedGraph(); AvailableQueue->initNodes(SUnits); @@ -267,5 +263,5 @@ void ScheduleDAGVLIW::listScheduleTopDown() { /// createVLIWDAGScheduler - This creates a top-down list scheduler. ScheduleDAGSDNodes *llvm::createVLIWDAGScheduler(SelectionDAGISel *IS, CodeGenOptLevel) { - return new ScheduleDAGVLIW(*IS->MF, IS->AA, new ResourcePriorityQueue(IS)); + return new ScheduleDAGVLIW(*IS->MF, new ResourcePriorityQueue(IS)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 743ae4895a1b1c..0f9790a10a1397 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8126,13 +8126,11 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, } } -static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, - SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size, Align Alignment, - bool isVol, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo, - const AAMDNodes &AAInfo, AAResults *AA) { +static SDValue getMemcpyLoadsAndStores( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, + uint64_t Size, Align Alignment, bool isVol, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo 
SrcPtrInfo, + const AAMDNodes &AAInfo, BatchAAResults *BatchAA) { // Turn a memcpy of undef to nop. // FIXME: We need to honor volatile even is Src is undef. if (Src.isUndef()) @@ -8198,8 +8196,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, const Value *SrcVal = dyn_cast_if_present(SrcPtrInfo.V); bool isConstant = - AA && SrcVal && - AA->pointsToConstantMemory(MemoryLocation(SrcVal, Size, AAInfo)); + BatchAA && SrcVal && + BatchAA->pointsToConstantMemory(MemoryLocation(SrcVal, Size, AAInfo)); MachineMemOperand::Flags MMOFlags = isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; @@ -8584,7 +8582,8 @@ SDValue SelectionDAG::getMemcpy( SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional OverrideTailCall, MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo, AAResults *AA) { + MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo, + BatchAAResults *BatchAA) { // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. 
ConstantSDNode *ConstantSize = dyn_cast(Size); @@ -8595,7 +8594,7 @@ SDValue SelectionDAG::getMemcpy( SDValue Result = getMemcpyLoadsAndStores( *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment, - isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, AA); + isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA); if (Result.getNode()) return Result; } @@ -8616,7 +8615,7 @@ SDValue SelectionDAG::getMemcpy( assert(ConstantSize && "AlwaysInline requires a constant size!"); return getMemcpyLoadsAndStores( *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment, - isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, AA); + isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA); } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); @@ -8711,7 +8710,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, std::optional OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, - const AAMDNodes &AAInfo, AAResults *AA) { + const AAMDNodes &AAInfo, + BatchAAResults *BatchAA) { // Check to see if we should lower the memmove to loads and stores first. // For cases within the target-specified limits, this is the best choice. 
ConstantSDNode *ConstantSize = dyn_cast(Size); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9f1aadcb279a99..700eb26ca02a83 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1082,10 +1082,10 @@ RegsForValue::getRegsAndSizes() const { return OutVec; } -void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, +void SelectionDAGBuilder::init(GCFunctionInfo *gfi, BatchAAResults *aa, AssumptionCache *ac, const TargetLibraryInfo *li) { - AA = aa; + BatchAA = aa; AC = ac; GFI = gfi; LibInfo = li; @@ -4585,8 +4585,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Root = getRoot(); else if (NumValues > MaxParallelChains) Root = getMemoryRoot(); - else if (AA && - AA->pointsToConstantMemory(MemoryLocation( + else if (BatchAA && + BatchAA->pointsToConstantMemory(MemoryLocation( SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)), AAInfo))) { @@ -4688,8 +4688,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { const Value *SV = I.getOperand(0); Type *Ty = I.getType(); assert( - (!AA || - !AA->pointsToConstantMemory(MemoryLocation( + (!BatchAA || + !BatchAA->pointsToConstantMemory(MemoryLocation( SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)), I.getAAMetadata()))) && "load_from_swift_error should not be constant memory"); @@ -4998,7 +4998,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { // Do not serialize masked loads of constant memory with anything. MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); - bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); + bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); @@ -6534,7 +6534,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, /* AlwaysInline */ false, &I, std::nullopt, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)), - I.getAAMetadata(), AA); + I.getAAMetadata(), BatchAA); updateDAGForMaybeTailCall(MC); return; } @@ -6555,7 +6555,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, /* AlwaysInline */ true, &I, std::nullopt, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)), - I.getAAMetadata(), AA); + I.getAAMetadata(), BatchAA); updateDAGForMaybeTailCall(MC); return; } @@ -6608,7 +6608,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, /* OverrideTailCall */ std::nullopt, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)), - I.getAAMetadata(), AA); + I.getAAMetadata(), BatchAA); updateDAGForMaybeTailCall(MM); return; } @@ -8435,7 +8435,7 @@ void SelectionDAGBuilder::visitVPLoad( if (!Alignment) Alignment = DAG.getEVTAlign(VT); MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); - bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); + bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, @@ -8564,7 +8564,7 @@ void SelectionDAGBuilder::visitVPStridedLoad( AAMDNodes AAInfo = VPIntrin.getAAMetadata(); const MDNode *Ranges = getRangeMetadata(VPIntrin); MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); - bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); + bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( @@ -9021,7 +9021,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. - if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { + if (Builder.BatchAA && Builder.BatchAA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 3a8dc25e98700e..ed85deef64fa79 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -225,7 +225,7 @@ class SelectionDAGBuilder { static const unsigned LowestSDNodeOrder = 1; SelectionDAG &DAG; - AAResults *AA = nullptr; + BatchAAResults *BatchAA = nullptr; AssumptionCache *AC = nullptr; const TargetLibraryInfo *LibInfo = nullptr; @@ -280,7 +280,7 @@ class SelectionDAGBuilder { SL(std::make_unique(this, funcinfo)), FuncInfo(funcinfo), SwiftError(swifterror) {} - void init(GCFunctionInfo *gfi, AAResults *AA, AssumptionCache *AC, + void init(GCFunctionInfo *gfi, BatchAAResults *BatchAA, AssumptionCache *AC, const TargetLibraryInfo *li); /// Clear out the current SelectionDAG and the associated state and prepare diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index b416e98fe61a8b..3b1abf7f3d994f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -502,9 +502,9 @@ void SelectionDAGISel::initializeAnalysisResults( FuncInfo->BPI = nullptr; if (OptLevel != CodeGenOptLevel::None) - AA = &FAM.getResult(Fn); + BatchAA.emplace(FAM.getResult(Fn)); else - AA = nullptr; + BatchAA = 
std::nullopt; SP = &FAM.getResult(Fn); @@ -560,9 +560,9 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { FuncInfo->BPI = nullptr; if (OptLevel != CodeGenOptLevel::None) - AA = &MFP.getAnalysis().getAAResults(); + BatchAA.emplace(MFP.getAnalysis().getAAResults()); else - AA = nullptr; + BatchAA = std::nullopt; SP = &MFP.getAnalysis().getLayoutInfo(); @@ -581,7 +581,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { ISEL_DUMP(dbgs() << "\n\n\n=== " << FuncName << '\n'); - SDB->init(GFI, AA, AC, LibInfo); + SDB->init(GFI, getBatchAA(), AC, LibInfo); MF->setHasInlineAsm(false); @@ -955,7 +955,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine1", "DAG Combining 1", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); + CurDAG->Combine(BeforeLegalizeTypes, getBatchAA(), OptLevel); } ISEL_DUMP(dbgs() << "\nOptimized lowered selection DAG: " @@ -1001,7 +1001,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lt", "DAG Combining after legalize types", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); + CurDAG->Combine(AfterLegalizeTypes, getBatchAA(), OptLevel); } ISEL_DUMP(dbgs() << "\nOptimized type-legalized selection DAG: " @@ -1055,7 +1055,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel); + CurDAG->Combine(AfterLegalizeVectorOps, getBatchAA(), OptLevel); } ISEL_DUMP(dbgs() << "\nOptimized vector-legalized selection DAG: " @@ -1095,7 +1095,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine2", "DAG Combining 2", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); + CurDAG->Combine(AfterLegalizeDAG, 
getBatchAA(), OptLevel); } ISEL_DUMP(dbgs() << "\nOptimized legalized selection DAG: " diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 3d90e3f6f67817..caf01ccd1ef7c0 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1498,8 +1498,8 @@ bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store, if (V1 == V2 && End1 == End2) return false; - return AA->isNoAlias(MemoryLocation(V1, End1, Load->getAAInfo()), - MemoryLocation(V2, End2, Store->getAAInfo())); + return BatchAA->isNoAlias(MemoryLocation(V1, End1, Load->getAAInfo()), + MemoryLocation(V2, End2, Store->getAAInfo())); } bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const { From d7c14c8f976fd291984e0c7eed75dd3331b1ed6d Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Thu, 23 Jan 2025 18:23:05 +0900 Subject: [PATCH 107/208] [IR] Replace of PointerType::getUnqual(Type) with opaque version (NFC) (#123909) Follow up to https://github.com/llvm/llvm-project/issues/123569 --- llvm/examples/BrainF/BrainF.cpp | 9 ++++----- llvm/include/llvm/FuzzMutate/OpDescriptor.h | 6 ++++-- llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 4 ---- llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 2 +- llvm/lib/CodeGen/SjLjEHPrepare.cpp | 4 ++-- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 10 +++------- llvm/lib/Frontend/Offloading/OffloadWrapper.cpp | 4 ++-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 6 +++--- llvm/lib/IR/Constants.cpp | 3 ++- llvm/lib/IR/ConstantsContext.h | 2 +- llvm/lib/IR/InlineAsm.cpp | 5 +++-- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 5 ----- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4 ++-- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 2 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 2 +- llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 10 ++++------ .../Instrumentation/DataFlowSanitizer.cpp | 4 ++-- 
llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 10 +--------- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 13 ++----------- llvm/tools/bugpoint/Miscompilation.cpp | 9 ++------- 21 files changed, 41 insertions(+), 75 deletions(-) diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp index e62cc7bd591a3f..cdd1ad872ab866 100644 --- a/llvm/examples/BrainF/BrainF.cpp +++ b/llvm/examples/BrainF/BrainF.cpp @@ -149,8 +149,7 @@ void BrainF::header(LLVMContext& C) { //declare i32 @puts(i8 *) FunctionCallee puts_func = module->getOrInsertFunction( - "puts", IntegerType::getInt32Ty(C), - PointerType::getUnqual(IntegerType::getInt8Ty(C))); + "puts", IntegerType::getInt32Ty(C), PointerType::getUnqual(C)); //brainf.aberror: aberrorbb = BasicBlock::Create(C, label, brainf_func); @@ -296,8 +295,8 @@ void BrainF::readloop(PHINode *phi, BasicBlock *oldbb, BasicBlock *testbb, builder->SetInsertPoint(bb_1); // Make part of PHI instruction now, wait until end of loop to finish - PHINode *phi_0 = PHINode::Create(PointerType::getUnqual(Int8Ty), 2, - headreg, testbb); + PHINode *phi_0 = + PHINode::Create(PointerType::getUnqual(C), 2, headreg, testbb); phi_0->addIncoming(curhead, bb_0); curhead = phi_0; @@ -451,7 +450,7 @@ void BrainF::readloop(PHINode *phi, BasicBlock *oldbb, BasicBlock *testbb, //%head.%d = phi i8 *[%head.%d, %main.%d] PHINode *phi_1 = - builder->CreatePHI(PointerType::getUnqual(Int8Ty), 1, headreg); + builder->CreatePHI(PointerType::getUnqual(C), 1, headreg); phi_1->addIncoming(head_0, testbb); curhead = phi_1; } diff --git a/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/llvm/include/llvm/FuzzMutate/OpDescriptor.h index 4a3c2f767d00c8..771b711dd1b48d 100644 --- a/llvm/include/llvm/FuzzMutate/OpDescriptor.h +++ b/llvm/include/llvm/FuzzMutate/OpDescriptor.h @@ -155,7 +155,8 @@ static inline SourcePred anyPtrType() { std::vector Result; // TODO: Should these point at something? 
for (Type *T : Ts) - Result.push_back(PoisonValue::get(PointerType::getUnqual(T))); + Result.push_back( + PoisonValue::get(PointerType::getUnqual(T->getContext()))); return Result; }; return {Pred, Make}; @@ -175,7 +176,8 @@ static inline SourcePred sizedPtrType() { // as the pointer type will always be the same. for (Type *T : Ts) if (T->isSized()) - Result.push_back(PoisonValue::get(PointerType::getUnqual(T))); + Result.push_back( + PoisonValue::get(PointerType::getUnqual(T->getContext()))); return Result; }; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 210c7cab965edb..7d7d37b3d228dd 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -13601,7 +13601,7 @@ const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { else return nullptr; - Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty)); + Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Inst->getContext())); return getSizeOfExpr(ETy, Ty); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 700eb26ca02a83..ecaa61fdc86a4d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9005,10 +9005,6 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); if (LoadVT.isVector()) LoadTy = FixedVectorType::get(LoadTy, LoadVT.getVectorNumElements()); - - LoadInput = ConstantExpr::getBitCast(const_cast(LoadInput), - PointerType::getUnqual(LoadTy)); - if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(const_cast(LoadInput), LoadTy, Builder.DAG.getDataLayout())) diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index d93db494e0908e..60c8372577a93b 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ 
b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -233,7 +233,7 @@ bool ShadowStackGCLoweringImpl::doInitialization(Module &M) { // Specifies length of variable length array. EltTys.push_back(Type::getInt32Ty(M.getContext())); FrameMapTy = StructType::create(EltTys, "gc_map"); - PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy); + PointerType *FrameMapPtrTy = PointerType::getUnqual(M.getContext()); // struct StackEntry { // ShadowStackEntry *Next; // Caller's stack entry. diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 9630ba4307cd21..b55be23e4579d5 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -500,10 +500,10 @@ bool SjLjEHPrepareImpl::runOnFunction(Function &F) { Module &M = *F.getParent(); RegisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy)); + PointerType::getUnqual(FunctionContextTy->getContext())); UnregisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy)); + PointerType::getUnqual(FunctionContextTy->getContext())); PointerType *AllocaPtrTy = M.getDataLayout().getAllocaPtrType(M.getContext()); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 3c0c90b62bc090..b1dadbae93fec5 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -193,8 +193,7 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { {PlatformInstanceDecl, DSOHandle}); auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); - auto *AtExitCallbackTy = FunctionType::get(VoidTy, {}, false); - auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy); + auto *AtExitCallbackPtrTy = PointerType::getUnqual(*Ctx); auto *AtExit = addHelperAndWrapper( *M, "atexit", FunctionType::get(IntTy, {AtExitCallbackPtrTy}, 
false), GlobalValue::HiddenVisibility, "__lljit.atexit_helper", @@ -468,12 +467,9 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { *M, GenericIRPlatformSupportTy, true, GlobalValue::ExternalLinkage, nullptr, "__lljit.platform_support_instance"); - auto *Int8Ty = Type::getInt8Ty(*Ctx); auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); - auto *VoidTy = Type::getVoidTy(*Ctx); - auto *BytePtrTy = PointerType::getUnqual(Int8Ty); - auto *CxaAtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false); - auto *CxaAtExitCallbackPtrTy = PointerType::getUnqual(CxaAtExitCallbackTy); + auto *BytePtrTy = PointerType::getUnqual(*Ctx); + auto *CxaAtExitCallbackPtrTy = PointerType::getUnqual(*Ctx); auto *CxaAtExit = addHelperAndWrapper( *M, "__cxa_atexit", diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index d3cb5346f4ba5d..478cc8ab05d1ae 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -50,7 +50,7 @@ StructType *getDeviceImageTy(Module &M) { } PointerType *getDeviceImagePtrTy(Module &M) { - return PointerType::getUnqual(getDeviceImageTy(M)); + return PointerType::getUnqual(M.getContext()); } // struct __tgt_bin_desc { @@ -70,7 +70,7 @@ StructType *getBinDescTy(Module &M) { } PointerType *getBinDescPtrTy(Module &M) { - return PointerType::getUnqual(getBinDescTy(M)); + return PointerType::getUnqual(M.getContext()); } /// Creates binary descriptor for the given device images. 
Binary descriptor diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7dbf65fbf055bd..7ef9f2fc4f49d7 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -9177,16 +9177,16 @@ void OpenMPIRBuilder::initializeTypes(Module &M) { #define OMP_TYPE(VarName, InitValue) VarName = InitValue; #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ - VarName##PtrTy = PointerType::getUnqual(VarName##Ty); + VarName##PtrTy = PointerType::getUnqual(Ctx); #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ - VarName##Ptr = PointerType::getUnqual(VarName); + VarName##Ptr = PointerType::getUnqual(Ctx); #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \ T = StructType::getTypeByName(Ctx, StructName); \ if (!T) \ T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \ VarName = T; \ - VarName##Ptr = PointerType::getUnqual(T); + VarName##Ptr = PointerType::getUnqual(Ctx); #include "llvm/Frontend/OpenMP/OMPKinds.def" } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index db5effbd9a43e7..33f4dc78c6d3f9 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2482,7 +2482,8 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) { // Note that a non-inbounds gep is used, as null isn't within any object. 
Constant *GEPIdx = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); Constant *GEP = getGetElementPtr( - Ty, Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx); + Ty, Constant::getNullValue(PointerType::getUnqual(Ty->getContext())), + GEPIdx); return getPtrToInt(GEP, Type::getInt64Ty(Ty->getContext())); } diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 08bf3f9dff5e66..e5c9622e09927a 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -380,7 +380,7 @@ struct InlineAsmKeyType { using TypeClass = ConstantInfo::TypeClass; InlineAsm *create(TypeClass *Ty) const { - assert(PointerType::getUnqual(FTy) == Ty); + assert(PointerType::getUnqual(FTy->getContext()) == Ty); return new InlineAsm(FTy, std::string(AsmString), std::string(Constraints), HasSideEffects, IsAlignStack, AsmDialect, CanThrow); } diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp index aeaa6a3741b949..922081468a7750 100644 --- a/llvm/lib/IR/InlineAsm.cpp +++ b/llvm/lib/IR/InlineAsm.cpp @@ -30,7 +30,7 @@ using namespace llvm; InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString, const std::string &constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect, bool canThrow) - : Value(PointerType::getUnqual(FTy), Value::InlineAsmVal), + : Value(PointerType::getUnqual(FTy->getContext()), Value::InlineAsmVal), AsmString(asmString), Constraints(constraints), FTy(FTy), HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), Dialect(asmDialect), CanThrow(canThrow) { @@ -47,7 +47,8 @@ InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString, InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects, isAlignStack, asmDialect, canThrow); LLVMContextImpl *pImpl = FTy->getContext().pImpl; - return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key); + return pImpl->InlineAsms.getOrCreate( + PointerType::getUnqual(FTy->getContext()), Key); } void 
InlineAsm::destroyConstant() { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index cd093317275ee9..77537df1ae053e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1954,10 +1954,8 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { Align Alignment = BasePtr->getPointerAlignment(II.getDataLayout()); - Type *VecPtrTy = PointerType::getUnqual(Ty); Value *Ptr = IC.Builder.CreateGEP(cast(Ty)->getElementType(), BasePtr, IndexBase); - Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy); CallInst *MaskedLoad = IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); MaskedLoad->takeName(&II); @@ -1986,9 +1984,6 @@ instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { Value *Ptr = IC.Builder.CreateGEP(cast(Ty)->getElementType(), BasePtr, IndexBase); - Type *VecPtrTy = PointerType::getUnqual(Ty); - Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy); - (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); return IC.eraseInstFromFunction(II); diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index d0cd38cf723636..c2c3a59ed05000 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2323,7 +2323,7 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, Align(8)); Entry.Node = FIPtr; - Entry.Ty = PointerType::getUnqual(ArgTy); + Entry.Ty = PointerType::getUnqual(ArgTy->getContext()); } Args.push_back(Entry); return Chain; @@ -2351,7 +2351,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, int RetFI = MFI.CreateStackObject(16, Align(8), false); RetPtr = DAG.getFrameIndex(RetFI, PtrVT); Entry.Node = RetPtr; - Entry.Ty = PointerType::getUnqual(RetTy); + Entry.Ty = PointerType::getUnqual(RetTy->getContext()); if (!Subtarget->is64Bit()) { Entry.IsSRet = 
true; Entry.IndirectType = RetTy; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 5aef016720cf4c..839a206033a0c6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -580,7 +580,7 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallBase *CI) { return It->second; // Put the pointer to the callee as first argument - ArgTys.push_back(PointerType::getUnqual(CalleeFTy)); + ArgTys.push_back(PointerType::getUnqual(CI->getContext())); // Add argument types ArgTys.append(CalleeFTy->param_begin(), CalleeFTy->param_end()); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 3808147fc26009..23ac55e8ce0cde 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1419,7 +1419,7 @@ struct SwitchCoroutineSplitter { SmallVector NewParams; NewParams.reserve(OldParams.size() + 1); NewParams.append(OldParams.begin(), OldParams.end()); - NewParams.push_back(PointerType::getUnqual(Shape.FrameTy)); + NewParams.push_back(PointerType::getUnqual(Shape.FrameTy->getContext())); auto *NewFnTy = FunctionType::get(OrigFnTy->getReturnType(), NewParams, OrigFnTy->isVarArg()); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2f171c3c981d40..e889926930082f 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1529,8 +1529,6 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, FunctionType *NewFT = FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs, CB.getFunctionType()->isVarArg()); - PointerType *NewFTPtr = PointerType::getUnqual(NewFT); - IRBuilder<> IRB(&CB); std::vector Args; Args.push_back(VCallSite.VTable); 
@@ -1538,11 +1536,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, CallBase *NewCS = nullptr; if (isa(CB)) - NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args); + NewCS = IRB.CreateCall(NewFT, JT, Args); else - NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr), - cast(CB).getNormalDest(), - cast(CB).getUnwindDest(), Args); + NewCS = + IRB.CreateInvoke(NewFT, JT, cast(CB).getNormalDest(), + cast(CB).getUnwindDest(), Args); NewCS->setCallingConv(CB.getCallingConv()); AttributeList Attrs = CB.getAttributes(); diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index fd69b3f244ec81..fcabcdfb0ba9b3 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1150,9 +1150,9 @@ bool DataFlowSanitizer::initializeModule(Module &M) { Ctx = &M.getContext(); Int8Ptr = PointerType::getUnqual(*Ctx); OriginTy = IntegerType::get(*Ctx, OriginWidthBits); - OriginPtrTy = PointerType::getUnqual(OriginTy); + OriginPtrTy = PointerType::getUnqual(*Ctx); PrimitiveShadowTy = IntegerType::get(*Ctx, ShadowWidthBits); - PrimitiveShadowPtrTy = PointerType::getUnqual(PrimitiveShadowTy); + PrimitiveShadowPtrTy = PointerType::getUnqual(*Ctx); IntptrTy = DL.getIntPtrType(*Ctx); ZeroPrimitiveShadow = ConstantInt::getSigned(PrimitiveShadowTy, 0); ZeroOrigin = ConstantInt::getSigned(OriginTy, 0); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 1d213e2aeae5a5..b020591c203dbd 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -379,15 +379,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong( << " Retain: " << *Retain << "\n" << " Load: " << *Load << "\n"); - LLVMContext &C = Release->getContext(); - Type *I8X = 
PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - - Value *Args[] = { Load->getPointerOperand(), New }; - if (Args[0]->getType() != I8XX) - Args[0] = new BitCastInst(Args[0], I8XX, "", Store->getIterator()); - if (Args[1]->getType() != I8X) - Args[1] = new BitCastInst(Args[1], I8X, "", Store->getIterator()); + Value *Args[] = {Load->getPointerOperand(), New}; Function *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong); CallInst *StoreStrong = objcarc::createCallInstWithColors( Decl, Args, "", Store->getIterator(), BlockColors); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 5bfbe95fafa05e..340d55190a5e6c 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -1762,21 +1762,15 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove, DenseMap &Releases, SmallVectorImpl &DeadInsts, Module *M) { - Type *ArgTy = Arg->getType(); - Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); - LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n"); // Insert the new retain and release calls. for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) { - Value *MyArg = ArgTy == ParamTy ? Arg - : new BitCastInst(Arg, ParamTy, "", - InsertPt->getIterator()); Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain); SmallVector BundleList; addOpBundleForFunclet(InsertPt->getParent(), BundleList); CallInst *Call = - CallInst::Create(Decl, MyArg, BundleList, "", InsertPt->getIterator()); + CallInst::Create(Decl, Arg, BundleList, "", InsertPt->getIterator()); Call->setDoesNotThrow(); Call->setTailCall(); @@ -1786,14 +1780,11 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove, << *InsertPt << "\n"); } for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) { - Value *MyArg = ArgTy == ParamTy ? 
Arg - : new BitCastInst(Arg, ParamTy, "", - InsertPt->getIterator()); Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); SmallVector BundleList; addOpBundleForFunclet(InsertPt->getParent(), BundleList); CallInst *Call = - CallInst::Create(Decl, MyArg, BundleList, "", InsertPt->getIterator()); + CallInst::Create(Decl, Arg, BundleList, "", InsertPt->getIterator()); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M); diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp index b165b8220c20bd..4cf7de3659b8a4 100644 --- a/llvm/tools/bugpoint/Miscompilation.cpp +++ b/llvm/tools/bugpoint/Miscompilation.cpp @@ -894,18 +894,13 @@ CleanupAndPrepareModules(BugDriver &BD, std::unique_ptr Test, CallInst *Resolver = CallInst::Create(resolverFunc, ResolverArgs, "resolver", LookupBB); - // Cast the result from the resolver to correctly-typed function. - CastInst *CastedResolver = new BitCastInst( - Resolver, PointerType::getUnqual(F->getFunctionType()), - "resolverCast", LookupBB); - // Save the value in our cache. - new StoreInst(CastedResolver, Cache, LookupBB); + new StoreInst(Resolver, Cache, LookupBB); BranchInst::Create(DoCallBB, LookupBB); PHINode *FuncPtr = PHINode::Create(NullPtr->getType(), 2, "fp", DoCallBB); - FuncPtr->addIncoming(CastedResolver, LookupBB); + FuncPtr->addIncoming(Resolver, LookupBB); FuncPtr->addIncoming(CachedVal, EntryBB); // Save the argument list. 
From 9fd92634749c75b39be829c22240567ccda3ffce Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 23 Jan 2025 04:26:02 -0500 Subject: [PATCH 108/208] [libc++abi] Remove support for Android 4 and older (#124054) --- libcxxabi/src/abort_message.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/libcxxabi/src/abort_message.cpp b/libcxxabi/src/abort_message.cpp index 9e5a984807e2c3..d51d9d87d35e02 100644 --- a/libcxxabi/src/abort_message.cpp +++ b/libcxxabi/src/abort_message.cpp @@ -12,13 +12,8 @@ #include "abort_message.h" #ifdef __BIONIC__ -# include -# if __ANDROID_API__ >= 21 -# include - extern "C" void android_set_abort_message(const char* msg); -# else -# include -# endif // __ANDROID_API__ >= 21 +# include +extern "C" void android_set_abort_message(const char* msg); #endif // __BIONIC__ #if defined(__APPLE__) && __has_include() @@ -59,7 +54,6 @@ void __abort_message(const char* format, ...) vasprintf(&buffer, format, list); va_end(list); -# if __ANDROID_API__ >= 21 // Show error in tombstone. android_set_abort_message(buffer); @@ -67,12 +61,6 @@ void __abort_message(const char* format, ...) openlog("libc++abi", 0, 0); syslog(LOG_CRIT, "%s", buffer); closelog(); -# else - // The good error reporting wasn't available in Android until L. Since we're - // about to abort anyway, just call __assert2, which will log _somewhere_ - // (tombstone and/or logcat) in older releases. 
- __assert2(__FILE__, __LINE__, __func__, buffer); -# endif // __ANDROID_API__ >= 21 #endif // __BIONIC__ abort(); From 6bc68d0fe94e7fbdec40e1306bf8db1b0db3110c Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 23 Jan 2025 04:27:26 -0500 Subject: [PATCH 109/208] [libc++] Remove support for Android 4 and older (#124062) --- .../__cxx03/__locale_dir/locale_base_api/android.h | 7 +------ .../include/__locale_dir/locale_base_api/android.h | 7 +------ libcxx/src/verbose_abort.cpp | 14 +------------- 3 files changed, 3 insertions(+), 25 deletions(-) diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h index 265dbf892a54f6..e96e64d9816474 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h @@ -18,9 +18,6 @@ extern "C" { } #include <__cxx03/android/api-level.h> -#if __ANDROID_API__ < 21 -# include <__cxx03/__support/xlocale/__posix_l_fallback.h> -#endif // If we do not have this header, we are in a platform build rather than an NDK // build, which will always be at least as new as the ToT NDK, in which case we @@ -30,9 +27,7 @@ extern "C" { // In NDK versions later than 16, locale-aware functions are provided by // legacy_stdlib_inlines.h # if __NDK_MAJOR__ <= 16 -# if __ANDROID_API__ < 21 -# include <__cxx03/__support/xlocale/__strtonum_fallback.h> -# elif __ANDROID_API__ < 26 +# if __ANDROID_API__ < 26 inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { return ::strtof(__nptr, __endptr); diff --git a/libcxx/include/__locale_dir/locale_base_api/android.h b/libcxx/include/__locale_dir/locale_base_api/android.h index 08ef5407dedf4e..36b8d93e1b228c 100644 --- a/libcxx/include/__locale_dir/locale_base_api/android.h +++ b/libcxx/include/__locale_dir/locale_base_api/android.h @@ -18,9 +18,6 @@ extern "C" { } #include -#if __ANDROID_API__ < 21 -# include 
<__support/xlocale/__posix_l_fallback.h> -#endif // If we do not have this header, we are in a platform build rather than an NDK // build, which will always be at least as new as the ToT NDK, in which case we @@ -30,9 +27,7 @@ extern "C" { // In NDK versions later than 16, locale-aware functions are provided by // legacy_stdlib_inlines.h # if __NDK_MAJOR__ <= 16 -# if __ANDROID_API__ < 21 -# include <__support/xlocale/__strtonum_fallback.h> -# elif __ANDROID_API__ < 26 +# if __ANDROID_API__ < 26 inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { return ::strtof(__nptr, __endptr); diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp index 6704709d247ca1..fd6bc4943d6bab 100644 --- a/libcxx/src/verbose_abort.cpp +++ b/libcxx/src/verbose_abort.cpp @@ -13,13 +13,8 @@ #include #ifdef __BIONIC__ -# include -# if __ANDROID_API__ >= 21 -# include +# include extern "C" void android_set_abort_message(const char* msg); -# else -# include -# endif // __ANDROID_API__ >= 21 #endif // __BIONIC__ #if defined(__APPLE__) && __has_include() @@ -54,7 +49,6 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) _LIBCPP_VERBOS #elif defined(__BIONIC__) vasprintf(&buffer, format, list); -# if __ANDROID_API__ >= 21 // Show error in tombstone. android_set_abort_message(buffer); @@ -62,12 +56,6 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) _LIBCPP_VERBOS openlog("libc++", 0, 0); syslog(LOG_CRIT, "%s", buffer); closelog(); -# else - // The good error reporting wasn't available in Android until L. Since we're - // about to abort anyway, just call __assert2, which will log _somewhere_ - // (tombstone and/or logcat) in older releases. 
- __assert2(__FILE__, __LINE__, __func__, buffer); -# endif // __ANDROID_API__ >= 21 #endif va_end(list); From 0429bfea49615882e89ee2350ffde777ce77fb95 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 23 Jan 2025 10:32:21 +0100 Subject: [PATCH 110/208] [libc++] Remove a few unused includes (#124025) --- libcxx/include/__type_traits/aligned_storage.h | 1 - libcxx/include/__type_traits/aligned_union.h | 1 - libcxx/include/__type_traits/common_reference.h | 1 - libcxx/include/__type_traits/datasizeof.h | 2 -- libcxx/include/__type_traits/is_always_bitcastable.h | 2 -- libcxx/include/__type_traits/make_signed.h | 1 - libcxx/include/__type_traits/make_unsigned.h | 1 - libcxx/include/__type_traits/remove_cvref.h | 2 -- libcxx/include/__type_traits/type_list.h | 1 - 9 files changed, 12 deletions(-) diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index d98749980122ea..3c39a351e35010 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -12,7 +12,6 @@ #include <__config> #include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include <__type_traits/nat.h> #include <__type_traits/type_list.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__type_traits/aligned_union.h b/libcxx/include/__type_traits/aligned_union.h index de62a4b1c2a331..fa7d985b56dd3d 100644 --- a/libcxx/include/__type_traits/aligned_union.h +++ b/libcxx/include/__type_traits/aligned_union.h @@ -12,7 +12,6 @@ #include <__config> #include <__cstddef/size_t.h> #include <__type_traits/aligned_storage.h> -#include <__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/common_reference.h b/libcxx/include/__type_traits/common_reference.h index d436949e692fdb..0d35570da2622c 100644 --- a/libcxx/include/__type_traits/common_reference.h 
+++ b/libcxx/include/__type_traits/common_reference.h @@ -15,7 +15,6 @@ #include <__type_traits/copy_cvref.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_reference.h> -#include <__type_traits/remove_cv.h> #include <__type_traits/remove_cvref.h> #include <__type_traits/remove_reference.h> #include <__utility/declval.h> diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h index 0c1ed94f840294..54735cd52fdb59 100644 --- a/libcxx/include/__type_traits/datasizeof.h +++ b/libcxx/include/__type_traits/datasizeof.h @@ -11,8 +11,6 @@ #include <__config> #include <__cstddef/size_t.h> -#include <__type_traits/is_class.h> -#include <__type_traits/is_final.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_always_bitcastable.h b/libcxx/include/__type_traits/is_always_bitcastable.h index 4c6c43c6571fe7..044d2501669a8e 100644 --- a/libcxx/include/__type_traits/is_always_bitcastable.h +++ b/libcxx/include/__type_traits/is_always_bitcastable.h @@ -10,9 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_IS_ALWAYS_BITCASTABLE_H #include <__config> -#include <__type_traits/integral_constant.h> #include <__type_traits/is_integral.h> -#include <__type_traits/is_object.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_cv.h> diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h index 88513fea3006c7..42614a420f9fbf 100644 --- a/libcxx/include/__type_traits/make_signed.h +++ b/libcxx/include/__type_traits/make_signed.h @@ -13,7 +13,6 @@ #include <__type_traits/copy_cv.h> #include <__type_traits/is_enum.h> #include <__type_traits/is_integral.h> -#include <__type_traits/nat.h> #include <__type_traits/remove_cv.h> #include <__type_traits/type_list.h> diff --git a/libcxx/include/__type_traits/make_unsigned.h 
b/libcxx/include/__type_traits/make_unsigned.h index 83ff8b7bb8014f..50928b03b0eb64 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -15,7 +15,6 @@ #include <__type_traits/is_enum.h> #include <__type_traits/is_integral.h> #include <__type_traits/is_unsigned.h> -#include <__type_traits/nat.h> #include <__type_traits/remove_cv.h> #include <__type_traits/type_list.h> diff --git a/libcxx/include/__type_traits/remove_cvref.h b/libcxx/include/__type_traits/remove_cvref.h index e3c65944e33c16..25ee853aaf2fc0 100644 --- a/libcxx/include/__type_traits/remove_cvref.h +++ b/libcxx/include/__type_traits/remove_cvref.h @@ -11,8 +11,6 @@ #include <__config> #include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h index 34d78fc97c9780..f20c384db5ca84 100644 --- a/libcxx/include/__type_traits/type_list.h +++ b/libcxx/include/__type_traits/type_list.h @@ -11,7 +11,6 @@ #include <__config> #include <__cstddef/size_t.h> -#include <__type_traits/enable_if.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header From ee99c4d4845db66c4daa2373352133f4b237c942 Mon Sep 17 00:00:00 2001 From: SivanShani-Arm Date: Thu, 23 Jan 2025 09:46:59 +0000 Subject: [PATCH 111/208] [LLVM][Clang][AArch64] Implement AArch64 build attributes (#123990) - Added support for AArch64-specific build attributes. - Print AArch64 build attributes to assembly. - Emit AArch64 build attributes to ELF. 
Specification: https://github.com/ARM-software/abi-aa/pull/230 --- llvm/include/llvm/BinaryFormat/ELF.h | 2 + llvm/include/llvm/MC/MCELFStreamer.h | 25 +- .../llvm/Support/AArch64BuildAttributes.h | 75 +++++ llvm/lib/MC/MCELFStreamer.cpp | 65 ++++- llvm/lib/Support/AArch64BuildAttributes.cpp | 117 ++++++++ llvm/lib/Support/CMakeLists.txt | 1 + llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 105 +++++-- .../AArch64/AsmParser/AArch64AsmParser.cpp | 271 +++++++++++++++++- .../MCTargetDesc/AArch64ELFStreamer.cpp | 148 +++++++++- .../MCTargetDesc/AArch64TargetStreamer.cpp | 104 +++++++ .../MCTargetDesc/AArch64TargetStreamer.h | 32 +++ .../AArch64/aarch64-build-attributes-all.ll | 21 ++ .../AArch64/aarch64-build-attributes-bti.ll | 19 ++ .../AArch64/aarch64-build-attributes-gcs.ll | 19 ++ .../AArch64/aarch64-build-attributes-pac.ll | 19 ++ .../aarch64-build-attributes-pauthabi.ll | 18 ++ .../aarch64-build-attributes-asm-all.s | 25 ++ .../aarch64-build-attributes-asm-bti.s | 18 ++ .../aarch64-build-attributes-asm-err-attrs.s | 70 +++++ ...aarch64-build-attributes-asm-err-headers.s | 61 ++++ .../aarch64-build-attributes-asm-gcs.s | 18 ++ .../aarch64-build-attributes-asm-none.s | 25 ++ ...ch64-build-attributes-asm-numerical-tags.s | 39 +++ ...arch64-build-attributes-asm-out-of-order.s | 48 ++++ .../aarch64-build-attributes-asm-pac.s | 18 ++ ...d-attributes-asm-private-subsections-err.s | 28 ++ ...build-attributes-asm-private-subsections.s | 51 ++++ .../gn/secondary/llvm/lib/Support/BUILD.gn | 2 + 28 files changed, 1422 insertions(+), 22 deletions(-) create mode 100644 llvm/include/llvm/Support/AArch64BuildAttributes.h create mode 100644 llvm/lib/Support/AArch64BuildAttributes.cpp create mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll create mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll create mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll create mode 100644 
llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll create mode 100644 llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s create mode 100644 llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 1bc69f791bd84c..48ae0db80f43ee 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1158,6 +1158,8 @@ enum : unsigned { SHT_ARM_ATTRIBUTES = 0x70000003U, SHT_ARM_DEBUGOVERLAY = 0x70000004U, SHT_ARM_OVERLAYSECTION = 0x70000005U, + // Support for AArch64 build attributes + SHT_AARCH64_ATTRIBUTES = 0x70000003U, // Special aarch64-specific section for MTE support, as described in: // https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#section-types SHT_AARCH64_AUTH_RELR = 0x70000004U, diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index 94d14088d0f5d2..5a1cdd9e96cad4 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -96,7 +96,7 @@ class MCELFStreamer : public MCObjectStreamer { // This structure holds 
all attributes, accounting for their string / // numeric value, so we can later emit them in declaration order, keeping // all in the same vector. - enum { + enum Types { HiddenAttribute = 0, NumericAttribute, TextAttribute, @@ -105,6 +105,17 @@ class MCELFStreamer : public MCObjectStreamer { unsigned Tag; unsigned IntValue; std::string StringValue; + AttributeItem(Types Ty, unsigned Tg, unsigned IV, std::string SV) + : Type(Ty), Tag(Tg), IntValue(IV), StringValue(SV) {} + }; + + /// ELF object attributes subsection support + struct AttributeSubSection { + bool IsActive; + StringRef VendorName; + unsigned IsOptional; + unsigned ParameterType; + SmallVector Content; }; // Attributes that are added and managed entirely by target. @@ -119,13 +130,23 @@ class MCELFStreamer : public MCObjectStreamer { unsigned Type, MCSection *&AttributeSection) { createAttributesSection(Vendor, Section, Type, AttributeSection, Contents); } + void + emitAttributesSection(MCSection *&AttributeSection, const Twine &Section, + unsigned Type, + SmallVector &SubSectionVec) { + createAttributesWithSubsection(AttributeSection, Section, Type, + SubSectionVec); + } private: AttributeItem *getAttributeItem(unsigned Attribute); - size_t calculateContentSize(SmallVector &AttrsVec); + size_t calculateContentSize(SmallVector &AttrsVec) const; void createAttributesSection(StringRef Vendor, const Twine &Section, unsigned Type, MCSection *&AttributeSection, SmallVector &AttrsVec); + void createAttributesWithSubsection( + MCSection *&AttributeSection, const Twine &Section, unsigned Type, + SmallVector &SubSectionVec); // GNU attributes that will get emitted at the end of the asm file. 
SmallVector GNUAttributes; diff --git a/llvm/include/llvm/Support/AArch64BuildAttributes.h b/llvm/include/llvm/Support/AArch64BuildAttributes.h new file mode 100644 index 00000000000000..ea293b72f9bb11 --- /dev/null +++ b/llvm/include/llvm/Support/AArch64BuildAttributes.h @@ -0,0 +1,75 @@ +//===-- AArch64BuildAttributes.h - AARch64 Build Attributes -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains enumerations and support routines for AArch64 build +// attributes as defined in Build Attributes for the AArch64 document. +// +// Build Attributes for the Arm® 64-bit Architecture (AArch64) 2024Q1 +// +// https://github.com/ARM-software/abi-aa/pull/230 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H +#define LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +namespace AArch64BuildAttributes { + +/// AArch64 build attributes vendors IDs (a.k.a subsection name) +enum VendorID : unsigned { + AEABI_FEATURE_AND_BITS = 0, + AEABI_PAUTHABI = 1, + VENDOR_UNKNOWN = 404 // Treated as a private subsection name +}; +StringRef getVendorName(unsigned const Vendor); +VendorID getVendorID(StringRef const Vendor); + +enum SubsectionOptional : unsigned { + REQUIRED = 0, + OPTIONAL = 1, + OPTIONAL_NOT_FOUND = 404 +}; +StringRef getOptionalStr(unsigned Optional); +SubsectionOptional getOptionalID(StringRef Optional); +StringRef getSubsectionOptionalUnknownError(); + +enum SubsectionType : unsigned { ULEB128 = 0, NTBS = 1, TYPE_NOT_FOUND = 404 }; +StringRef getTypeStr(unsigned Type); +SubsectionType getTypeID(StringRef Type); +StringRef getSubsectionTypeUnknownError(); + 
+enum PauthABITags : unsigned { + TAG_PAUTH_PLATFORM = 1, + TAG_PAUTH_SCHEMA = 2, + PAUTHABI_TAG_NOT_FOUND = 404 +}; +StringRef getPauthABITagsStr(unsigned PauthABITag); +PauthABITags getPauthABITagsID(StringRef PauthABITag); + +enum FeatureAndBitsTags : unsigned { + TAG_FEATURE_BTI = 0, + TAG_FEATURE_PAC = 1, + TAG_FEATURE_GCS = 2, + FEATURE_AND_BITS_TAG_NOT_FOUND = 404 +}; +StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag); +FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag); + +enum FeatureAndBitsFlag : unsigned { + Feature_BTI_Flag = 1 << 0, + Feature_PAC_Flag = 1 << 1, + Feature_GCS_Flag = 1 << 2 +}; +} // namespace AArch64BuildAttributes +} // namespace llvm + +#endif // LLVM_SUPPORT_AARCH64BUILDATTRIBUTES_H \ No newline at end of file diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 64ab2b2ab58f5b..282c82198507d7 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -696,8 +696,8 @@ MCELFStreamer::getAttributeItem(unsigned Attribute) { return nullptr; } -size_t -MCELFStreamer::calculateContentSize(SmallVector &AttrsVec) { +size_t MCELFStreamer::calculateContentSize( + SmallVector &AttrsVec) const { size_t Result = 0; for (const AttributeItem &Item : AttrsVec) { switch (Item.Type) { @@ -783,6 +783,67 @@ void MCELFStreamer::createAttributesSection( AttrsVec.clear(); } +void MCELFStreamer::createAttributesWithSubsection( + MCSection *&AttributeSection, const Twine &Section, unsigned Type, + SmallVector &SubSectionVec) { + // + // [ NTBS: vendor-name + // + // ]* + // vendor-data expends to: + // * + if (0 == SubSectionVec.size()) { + return; + } + + // Switch section to AttributeSection or get/create the section. 
+ if (AttributeSection) { + switchSection(AttributeSection); + } else { + AttributeSection = getContext().getELFSection(Section, Type, 0); + switchSection(AttributeSection); + + // Format version + emitInt8(0x41); + } + + for (AttributeSubSection &SubSection : SubSectionVec) { + // subsection-length + vendor-name + '\0' + const size_t VendorHeaderSize = 4 + SubSection.VendorName.size() + 1; + // optional + parameter-type + const size_t VendorParameters = 1 + 1; + const size_t ContentsSize = calculateContentSize(SubSection.Content); + + emitInt32(VendorHeaderSize + VendorParameters + ContentsSize); + emitBytes(SubSection.VendorName); + emitInt8(0); // '\0' + emitInt8(SubSection.IsOptional); + emitInt8(SubSection.ParameterType); + + for (AttributeItem &Item : SubSection.Content) { + emitULEB128IntValue(Item.Tag); + switch (Item.Type) { + default: + assert(0 && "Invalid attribute type"); + break; + case AttributeItem::NumericAttribute: + emitULEB128IntValue(Item.IntValue); + break; + case AttributeItem::TextAttribute: + emitBytes(Item.StringValue); + emitInt8(0); // '\0' + break; + case AttributeItem::NumericAndTextAttributes: + emitULEB128IntValue(Item.IntValue); + emitBytes(Item.StringValue); + emitInt8(0); // '\0' + break; + } + } + } + SubSectionVec.clear(); +} + MCStreamer *llvm::createELFStreamer(MCContext &Context, std::unique_ptr &&MAB, std::unique_ptr &&OW, diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp new file mode 100644 index 00000000000000..ada34eb3f927d1 --- /dev/null +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -0,0 +1,117 @@ +//===-- AArch64BuildAttributes.cpp - AArch64 Build Attributes -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/AArch64BuildAttributes.h" +#include "llvm/ADT/StringSwitch.h" + +namespace llvm { +namespace AArch64BuildAttributes { + +StringRef getVendorName(unsigned Vendor) { + switch (Vendor) { + case AEABI_FEATURE_AND_BITS: + return "aeabi_feature_and_bits"; + case AEABI_PAUTHABI: + return "aeabi_pauthabi"; + case VENDOR_UNKNOWN: + return ""; + default: + assert(0 && "Vendor name error"); + return ""; + } +} +VendorID getVendorID(StringRef Vendor) { + return StringSwitch(Vendor) + .Case("aeabi_feature_and_bits", AEABI_FEATURE_AND_BITS) + .Case("aeabi_pauthabi", AEABI_PAUTHABI) + .Default(VENDOR_UNKNOWN); +} + +StringRef getOptionalStr(unsigned Optional) { + switch (Optional) { + case REQUIRED: + return "required"; + case OPTIONAL: + return "optional"; + case OPTIONAL_NOT_FOUND: + default: + return ""; + } +} +SubsectionOptional getOptionalID(StringRef Optional) { + return StringSwitch(Optional) + .Case("required", REQUIRED) + .Case("optional", OPTIONAL) + .Default(OPTIONAL_NOT_FOUND); +} +StringRef getSubsectionOptionalUnknownError() { + return "unknown AArch64 build attributes optionality, expected " + "required|optional"; +} + +StringRef getTypeStr(unsigned Type) { + switch (Type) { + case ULEB128: + return "uleb128"; + case NTBS: + return "ntbs"; + case TYPE_NOT_FOUND: + default: + return ""; + } +} +SubsectionType getTypeID(StringRef Type) { + return StringSwitch(Type) + .Cases("uleb128", "ULEB128", ULEB128) + .Cases("ntbs", "NTBS", NTBS) + .Default(TYPE_NOT_FOUND); +} +StringRef getSubsectionTypeUnknownError() { + return "unknown AArch64 build attributes type, expected uleb128|ntbs"; +} + +StringRef getPauthABITagsStr(unsigned PauthABITag) { + switch (PauthABITag) { + case TAG_PAUTH_PLATFORM: + return "Tag_PAuth_Platform"; + case TAG_PAUTH_SCHEMA: + return "Tag_PAuth_Schema"; + case 
PAUTHABI_TAG_NOT_FOUND: + default: + return ""; + } +} +PauthABITags getPauthABITagsID(StringRef PauthABITag) { + return StringSwitch(PauthABITag) + .Case("Tag_PAuth_Platform", TAG_PAUTH_PLATFORM) + .Case("Tag_PAuth_Schema", TAG_PAUTH_SCHEMA) + .Default(PAUTHABI_TAG_NOT_FOUND); +} + +StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { + switch (FeatureAndBitsTag) { + case TAG_FEATURE_BTI: + return "Tag_Feature_BTI"; + case TAG_FEATURE_PAC: + return "Tag_Feature_PAC"; + case TAG_FEATURE_GCS: + return "Tag_Feature_GCS"; + case FEATURE_AND_BITS_TAG_NOT_FOUND: + default: + return ""; + } +} +FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { + return StringSwitch(FeatureAndBitsTag) + .Case("Tag_Feature_BTI", TAG_FEATURE_BTI) + .Case("Tag_Feature_PAC", TAG_FEATURE_PAC) + .Case("Tag_Feature_GCS", TAG_FEATURE_GCS) + .Default(FEATURE_AND_BITS_TAG_NOT_FOUND); +} +} // namespace AArch64BuildAttributes +} // namespace llvm diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 2ecaea4b02bf61..122240c27b1fcd 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -144,6 +144,7 @@ add_llvm_component_library(LLVMSupport APInt.cpp APSInt.cpp ARMBuildAttrs.cpp + AArch64BuildAttributes.cpp ARMAttributeParser.cpp ARMWinEH.cpp Allocator.cpp diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 27e65d60122fd7..8d8520c68232be 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -208,6 +208,10 @@ class AArch64AsmPrinter : public AsmPrinter { /// pseudo instructions. 
bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst); + // Emit Build Attributes + void emitAttributes(unsigned Flags, uint64_t PAuthABIPlatform, + uint64_t PAuthABIVersion, AArch64TargetStreamer *TS); + void EmitToStreamer(MCStreamer &S, const MCInst &Inst); void EmitToStreamer(const MCInst &Inst) { EmitToStreamer(*OutStreamer, Inst); @@ -345,36 +349,53 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { if (!TT.isOSBinFormatELF()) return; - // Assemble feature flags that may require creation of a note section. - unsigned Flags = 0; + // For emitting build attributes and .note.gnu.property section + auto *TS = + static_cast(OutStreamer->getTargetStreamer()); + // Assemble feature flags that may require creation of build attributes and a + // note section. + unsigned BAFlags = 0; + unsigned GNUFlags = 0; if (const auto *BTE = mdconst::extract_or_null( - M.getModuleFlag("branch-target-enforcement"))) - if (!BTE->isZero()) - Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; + M.getModuleFlag("branch-target-enforcement"))) { + if (!BTE->isZero()) { + BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_BTI_Flag; + GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; + } + } if (const auto *GCS = mdconst::extract_or_null( - M.getModuleFlag("guarded-control-stack"))) - if (!GCS->isZero()) - Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_GCS; + M.getModuleFlag("guarded-control-stack"))) { + if (!GCS->isZero()) { + BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_GCS_Flag; + GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_GCS; + } + } if (const auto *Sign = mdconst::extract_or_null( - M.getModuleFlag("sign-return-address"))) - if (!Sign->isZero()) - Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + M.getModuleFlag("sign-return-address"))) { + if (!Sign->isZero()) { + BAFlags |= AArch64BuildAttributes::FeatureAndBitsFlag::Feature_PAC_Flag; + GNUFlags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + } + } uint64_t PAuthABIPlatform 
= -1; if (const auto *PAP = mdconst::extract_or_null( - M.getModuleFlag("aarch64-elf-pauthabi-platform"))) + M.getModuleFlag("aarch64-elf-pauthabi-platform"))) { PAuthABIPlatform = PAP->getZExtValue(); + } + uint64_t PAuthABIVersion = -1; if (const auto *PAV = mdconst::extract_or_null( - M.getModuleFlag("aarch64-elf-pauthabi-version"))) + M.getModuleFlag("aarch64-elf-pauthabi-version"))) { PAuthABIVersion = PAV->getZExtValue(); + } + // Emit AArch64 Build Attributes + emitAttributes(BAFlags, PAuthABIPlatform, PAuthABIVersion, TS); // Emit a .note.gnu.property section with the flags. - auto *TS = - static_cast(OutStreamer->getTargetStreamer()); - TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion); + TS->emitNoteSection(GNUFlags, PAuthABIPlatform, PAuthABIVersion); } void AArch64AsmPrinter::emitFunctionHeaderComment() { @@ -447,6 +468,58 @@ void AArch64AsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) { recordSled(CurSled, MI, Kind, 2); } +void AArch64AsmPrinter::emitAttributes(unsigned Flags, + uint64_t PAuthABIPlatform, + uint64_t PAuthABIVersion, + AArch64TargetStreamer *TS) { + + PAuthABIPlatform = (uint64_t(-1) == PAuthABIPlatform) ? 0 : PAuthABIPlatform; + PAuthABIVersion = (uint64_t(-1) == PAuthABIVersion) ? 0 : PAuthABIVersion; + + if (PAuthABIPlatform || PAuthABIVersion) { + TS->emitAtributesSubsection( + AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_PAUTHABI), + AArch64BuildAttributes::SubsectionOptional::REQUIRED, + AArch64BuildAttributes::SubsectionType::ULEB128); + TS->emitAttribute(AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_PAUTHABI), + AArch64BuildAttributes::TAG_PAUTH_PLATFORM, + PAuthABIPlatform, "", false); + TS->emitAttribute(AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_PAUTHABI), + AArch64BuildAttributes::TAG_PAUTH_SCHEMA, PAuthABIVersion, + "", false); + } + + unsigned BTIValue = + (Flags & AArch64BuildAttributes::Feature_BTI_Flag) ? 
1 : 0; + unsigned PACValue = + (Flags & AArch64BuildAttributes::Feature_PAC_Flag) ? 1 : 0; + unsigned GCSValue = + (Flags & AArch64BuildAttributes::Feature_GCS_Flag) ? 1 : 0; + + if (BTIValue || PACValue || GCSValue) { + TS->emitAtributesSubsection( + AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), + AArch64BuildAttributes::SubsectionOptional::OPTIONAL, + AArch64BuildAttributes::SubsectionType::ULEB128); + TS->emitAttribute(AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), + AArch64BuildAttributes::TAG_FEATURE_BTI, BTIValue, "", + false); + TS->emitAttribute(AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), + AArch64BuildAttributes::TAG_FEATURE_PAC, PACValue, "", + false); + TS->emitAttribute(AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_FEATURE_AND_BITS), + AArch64BuildAttributes::TAG_FEATURE_GCS, GCSValue, "", + false); + } +} + // Emit the following code for Intrinsic::{xray_customevent,xray_typedevent} // (built-in functions __xray_customevent/__xray_typedevent). 
// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 92f9f7309f8ec0..d3eda48f3276e9 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -42,7 +42,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" +#include "llvm/Support/AArch64BuildAttributes.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -228,6 +228,8 @@ class AArch64AsmParser : public MCTargetAsmParser { bool parseDirectiveSEHClearUnwoundToCall(SMLoc L); bool parseDirectiveSEHPACSignLR(SMLoc L); bool parseDirectiveSEHSaveAnyReg(SMLoc L, bool Paired, bool Writeback); + bool parseDirectiveAeabiSubSectionHeader(SMLoc L); + bool parseDirectiveAeabiAArch64Attr(SMLoc L); bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl &Loc); @@ -6992,6 +6994,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { const MCContext::Environment Format = getContext().getObjectFileType(); bool IsMachO = Format == MCContext::IsMachO; bool IsCOFF = Format == MCContext::IsCOFF; + bool IsELF = Format == MCContext::IsELF; auto IDVal = DirectiveID.getIdentifier().lower(); SMLoc Loc = DirectiveID.getLoc(); @@ -7087,6 +7090,13 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveSEHSaveAnyReg(Loc, true, true); else return true; + } else if (IsELF) { + if (IDVal == ".aeabi_subsection") + parseDirectiveAeabiSubSectionHeader(Loc); + else if (IDVal == ".aeabi_attribute") + parseDirectiveAeabiAArch64Attr(Loc); + else + return true; } else return true; return false; @@ -7823,6 +7833,265 @@ bool AArch64AsmParser::parseDirectiveSEHSaveAnyReg(SMLoc L, bool Paired, return false; } +bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) { + // Expecting 3 
AsmToken::Identifier after '.aeabi_subsection', a name and 2 + // parameters, e.g.: .aeabi_subsection (1)aeabi_feature_and_bits, (2)optional, + // (3)uleb128 separated by 2 commas. + MCAsmParser &Parser = getParser(); + + // Consume the name (subsection name) + StringRef SubsectionName; + AArch64BuildAttributes::VendorID SubsectionNameID; + if (Parser.getTok().is(AsmToken::Identifier)) { + SubsectionName = Parser.getTok().getIdentifier(); + SubsectionNameID = AArch64BuildAttributes::getVendorID(SubsectionName); + } else { + Error(Parser.getTok().getLoc(), "subsection name not found"); + return true; + } + Parser.Lex(); + // consume a comma + // parseComma() return *false* on success, and call Lex(), no need to call + // Lex() again. + if (Parser.parseComma()) { + return true; + } + + std::unique_ptr SubsectionExists = + getTargetStreamer().getAtributesSubsectionByName(SubsectionName); + + // Consume the first parameter (optionality parameter) + AArch64BuildAttributes::SubsectionOptional IsOptional; + // options: optional/required + if (Parser.getTok().is(AsmToken::Identifier)) { + StringRef Optionality = Parser.getTok().getIdentifier(); + IsOptional = AArch64BuildAttributes::getOptionalID(Optionality); + if (AArch64BuildAttributes::OPTIONAL_NOT_FOUND == IsOptional) { + Error(Parser.getTok().getLoc(), + AArch64BuildAttributes::getSubsectionOptionalUnknownError() + ": " + + Optionality); + return true; + } + if (SubsectionExists) { + if (IsOptional != SubsectionExists->IsOptional) { + Error(Parser.getTok().getLoc(), + "optionality mismatch! 
subsection '" + SubsectionName + + "' already exists with optionality defined as '" + + AArch64BuildAttributes::getOptionalStr( + SubsectionExists->IsOptional) + + "' and not '" + + AArch64BuildAttributes::getOptionalStr(IsOptional) + "'"); + return true; + } + } + } else { + Error(Parser.getTok().getLoc(), + "optionality parameter not found, expected required|optional"); + return true; + } + // Check for possible IsOptional unaccepted values for known subsections + if (AArch64BuildAttributes::AEABI_FEATURE_AND_BITS == SubsectionNameID) { + if (AArch64BuildAttributes::REQUIRED == IsOptional) { + Error(Parser.getTok().getLoc(), + "aeabi_feature_and_bits must be marked as optional"); + return true; + } + } + if (AArch64BuildAttributes::AEABI_PAUTHABI == SubsectionNameID) { + if (AArch64BuildAttributes::OPTIONAL == IsOptional) { + Error(Parser.getTok().getLoc(), + "aeabi_pauthabi must be marked as required"); + return true; + } + } + Parser.Lex(); + // consume a comma + if (Parser.parseComma()) { + return true; + } + + // Consume the second parameter (type parameter) + AArch64BuildAttributes::SubsectionType Type; + if (Parser.getTok().is(AsmToken::Identifier)) { + StringRef Name = Parser.getTok().getIdentifier(); + Type = AArch64BuildAttributes::getTypeID(Name); + if (AArch64BuildAttributes::TYPE_NOT_FOUND == Type) { + Error(Parser.getTok().getLoc(), + AArch64BuildAttributes::getSubsectionTypeUnknownError() + ": " + + Name); + return true; + } + if (SubsectionExists) { + if (Type != SubsectionExists->ParameterType) { + Error(Parser.getTok().getLoc(), + "type mismatch! 
subsection '" + SubsectionName + + "' already exists with type defined as '" + + AArch64BuildAttributes::getTypeStr( + SubsectionExists->ParameterType) + + "' and not '" + AArch64BuildAttributes::getTypeStr(Type) + + "'"); + return true; + } + } + } else { + Error(Parser.getTok().getLoc(), + "type parameter not found, expected uleb128|ntbs"); + return true; + } + // Check for possible unaccepted 'type' values for known subsections + if (AArch64BuildAttributes::AEABI_FEATURE_AND_BITS == SubsectionNameID || + AArch64BuildAttributes::AEABI_PAUTHABI == SubsectionNameID) { + if (AArch64BuildAttributes::NTBS == Type) { + Error(Parser.getTok().getLoc(), + SubsectionName + " must be marked as ULEB128"); + return true; + } + } + Parser.Lex(); + // Parsing finished, check for trailing tokens. + if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) { + Error(Parser.getTok().getLoc(), "unexpected token for AArch64 build " + "attributes subsection header directive"); + return true; + } + + getTargetStreamer().emitAtributesSubsection(SubsectionName, IsOptional, Type); + + return false; +} + +bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) { + // Expecting 2 Tokens: after '.aeabi_attribute', e.g.: + // .aeabi_attribute (1)Tag_Feature_BTI, (2)[uleb128|ntbs] + // separated by a comma. 
+ MCAsmParser &Parser = getParser(); + + std::unique_ptr ActiveSubsection = + getTargetStreamer().getActiveAtributesSubsection(); + if (nullptr == ActiveSubsection) { + Error(Parser.getTok().getLoc(), + "no active subsection, build attribute can not be added"); + return true; + } + StringRef ActiveSubsectionName = ActiveSubsection->VendorName; + unsigned ActiveSubsectionType = ActiveSubsection->ParameterType; + + unsigned ActiveSubsectionID = AArch64BuildAttributes::VENDOR_UNKNOWN; + if (AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_PAUTHABI) == ActiveSubsectionName) + ActiveSubsectionID = AArch64BuildAttributes::AEABI_PAUTHABI; + if (AArch64BuildAttributes::getVendorName( + AArch64BuildAttributes::AEABI_FEATURE_AND_BITS) == + ActiveSubsectionName) + ActiveSubsectionID = AArch64BuildAttributes::AEABI_FEATURE_AND_BITS; + + StringRef TagStr = ""; + unsigned Tag; + if (Parser.getTok().is(AsmToken::Identifier)) { + TagStr = Parser.getTok().getIdentifier(); + switch (ActiveSubsectionID) { + default: + assert(0 && "Subsection name error"); + break; + case AArch64BuildAttributes::VENDOR_UNKNOWN: + // Private subsection, accept any tag. 
+ break; + case AArch64BuildAttributes::AEABI_PAUTHABI: + Tag = AArch64BuildAttributes::getPauthABITagsID(TagStr); + if (AArch64BuildAttributes::PAUTHABI_TAG_NOT_FOUND == Tag) { + Error(Parser.getTok().getLoc(), "unknown AArch64 build attribute '" + + TagStr + "' for subsection '" + + ActiveSubsectionName + "'"); + return true; + } + break; + case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: + Tag = AArch64BuildAttributes::getFeatureAndBitsTagsID(TagStr); + if (AArch64BuildAttributes::FEATURE_AND_BITS_TAG_NOT_FOUND == Tag) { + Error(Parser.getTok().getLoc(), "unknown AArch64 build attribute '" + + TagStr + "' for subsection '" + + ActiveSubsectionName + "'"); + return true; + } + break; + } + } else if (Parser.getTok().is(AsmToken::Integer)) { + Tag = getTok().getIntVal(); + } else { + Error(Parser.getTok().getLoc(), "AArch64 build attributes tag not found"); + return true; + } + Parser.Lex(); + // consume a comma + // parseComma() return *false* on success, and call Lex(), no need to call + // Lex() again. 
+ if (Parser.parseComma()) { + return true; + } + + // Consume the second parameter (attribute value) + unsigned ValueInt = unsigned(-1); + std::string ValueStr = ""; + if (Parser.getTok().is(AsmToken::Integer)) { + if (AArch64BuildAttributes::NTBS == ActiveSubsectionType) { + Error( + Parser.getTok().getLoc(), + "active subsection type is NTBS (string), found ULEB128 (unsigned)"); + return true; + } + ValueInt = getTok().getIntVal(); + } else if (Parser.getTok().is(AsmToken::Identifier)) { + if (AArch64BuildAttributes::ULEB128 == ActiveSubsectionType) { + Error( + Parser.getTok().getLoc(), + "active subsection type is ULEB128 (unsigned), found NTBS (string)"); + return true; + } + ValueStr = Parser.getTok().getIdentifier(); + } else if (Parser.getTok().is(AsmToken::String)) { + if (AArch64BuildAttributes::ULEB128 == ActiveSubsectionType) { + Error( + Parser.getTok().getLoc(), + "active subsection type is ULEB128 (unsigned), found NTBS (string)"); + return true; + } + ValueStr = Parser.getTok().getString(); + } else { + Error(Parser.getTok().getLoc(), "AArch64 build attributes value not found"); + return true; + } + // Check for possible unaccepted values for known tags (AEABI_PAUTHABI, + // AEABI_FEATURE_AND_BITS) + if (!(ActiveSubsectionID == AArch64BuildAttributes::VENDOR_UNKNOWN) && + TagStr != "") { // TagStr was a recognized string + if (0 != ValueInt && 1 != ValueInt) { + Error(Parser.getTok().getLoc(), + "unknown AArch64 build attributes Value for Tag '" + TagStr + + "' options are 0|1"); + return true; + } + } + Parser.Lex(); + // Parsing finished, check for trailing tokens. 
+ if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) { + Error(Parser.getTok().getLoc(), + "unexpected token for AArch64 build attributes tag and value " + "attribute directive"); + return true; + } + + if (unsigned(-1) != ValueInt) { + getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, ValueInt, "", + false); + } + + if ("" != ValueStr) { + getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, unsigned(-1), + ValueStr, false); + } + return false; +} + bool AArch64AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // Try @AUTH expressions: they're more complex than the usual symbol variants. if (!parseAuthExpr(Res, EndLoc)) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 5bae846824548b..9f7a60074daeb9 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" +#include "llvm/Support/AArch64BuildAttributes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -45,6 +46,7 @@ class AArch64ELFStreamer; class AArch64TargetAsmStreamer : public AArch64TargetStreamer { formatted_raw_ostream &OS; + std::string VendorTag; void emitInst(uint32_t Inst) override; @@ -148,13 +150,137 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer { OS << "\t.seh_save_any_reg_px\tq" << Reg << ", " << Offset << "\n"; } + void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, + std::string String, bool Override) override { + + // AArch64 build attributes for assembly attribute form: + // .aeabi_attribute tag, value + if (unsigned(-1) == Value && "" == String) { + assert(0 && "Arguments error"); + return; + } + + unsigned VendorID = 
AArch64BuildAttributes::getVendorID(VendorName); + + switch (VendorID) { + default: + assert(0 && "Subsection name error"); + break; + case AArch64BuildAttributes::VENDOR_UNKNOWN: + if (unsigned(-1) != Value) { + OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; + AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", + Override); + } + if ("" != String) { + OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << String; + AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), + String, Override); + } + break; + // Note: AEABI_FEATURE_AND_BITS takes only unsigned values + case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: + switch (Tag) { + default: // allow emitting any attribute by number + OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; + // Keep the data structure consistent with the case of ELF emission + // (important for llvm-mc asm parsing) + AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", + Override); + break; + case AArch64BuildAttributes::TAG_FEATURE_BTI: + case AArch64BuildAttributes::TAG_FEATURE_GCS: + case AArch64BuildAttributes::TAG_FEATURE_PAC: + OS << "\t.aeabi_attribute" << "\t" + << AArch64BuildAttributes::getFeatureAndBitsTagsStr(Tag) << ", " + << Value; + AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", + Override); + break; + } + break; + // Note: AEABI_PAUTHABI takes only unsigned values + case AArch64BuildAttributes::AEABI_PAUTHABI: + switch (Tag) { + default: // allow emitting any attribute by number + OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value; + // Keep the data structure consistent with the case of ELF emission + // (important for llvm-mc asm parsing) + AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", + Override); + break; + case AArch64BuildAttributes::TAG_PAUTH_PLATFORM: + case AArch64BuildAttributes::TAG_PAUTH_SCHEMA: + OS << "\t.aeabi_attribute" << "\t" + << AArch64BuildAttributes::getPauthABITagsStr(Tag) << ", " << Value; + 
AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", + Override); + break; + } + break; + } + OS << "\n"; + } + + void emitAtributesSubsection( + StringRef SubsectionName, + AArch64BuildAttributes::SubsectionOptional Optional, + AArch64BuildAttributes::SubsectionType ParameterType) override { + // The AArch64 build attributes assembly subsection header format: + // ".aeabi_subsection name, optional, parameter type" + // optional: required (0) optional (1) + // parameter type: uleb128 or ULEB128 (0) ntbs or NTBS (1) + unsigned SubsectionID = AArch64BuildAttributes::getVendorID(SubsectionName); + + assert((0 == Optional || 1 == Optional) && + AArch64BuildAttributes::getSubsectionOptionalUnknownError().data()); + assert((0 == ParameterType || 1 == ParameterType) && + AArch64BuildAttributes::getSubsectionTypeUnknownError().data()); + + std::string SubsectionTag = ".aeabi_subsection"; + StringRef OptionalStr = getOptionalStr(Optional); + StringRef ParameterStr = getTypeStr(ParameterType); + + switch (SubsectionID) { + default: { + // Treated as a private subsection + break; + } + case AArch64BuildAttributes::AEABI_PAUTHABI: { + assert(AArch64BuildAttributes::REQUIRED == Optional && + "subsection .aeabi-pauthabi should be marked as " + "required and not as optional"); + assert(AArch64BuildAttributes::ULEB128 == ParameterType && + "subsection .aeabi-pauthabi should be " + "marked as uleb128 and not as ntbs"); + break; + } + case AArch64BuildAttributes::AEABI_FEATURE_AND_BITS: { + assert(AArch64BuildAttributes::OPTIONAL == Optional && + "subsection .aeabi_feature_and_bits should be " + "marked as optional and not as required"); + assert(AArch64BuildAttributes::ULEB128 == ParameterType && + "subsection .aeabi_feature_and_bits should " + "be marked as uleb128 and not as ntbs"); + break; + } + } + OS << "\t" << SubsectionTag << "\t" << SubsectionName << ", " << OptionalStr + << ", " << ParameterStr; + // Keep the data structure consistent with the case of ELF 
emission + // (important for llvm-mc asm parsing) + AArch64TargetStreamer::emitAtributesSubsection(SubsectionName, Optional, + ParameterType); + OS << "\n"; + } + public: AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); }; AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) - : AArch64TargetStreamer(S), OS(OS) {} + : AArch64TargetStreamer(S), OS(OS) {} void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n"; @@ -294,6 +420,23 @@ AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { return static_cast(Streamer); } +void AArch64TargetELFStreamer::emitAtributesSubsection( + StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional, + AArch64BuildAttributes::SubsectionType ParameterType) { + AArch64TargetStreamer::emitAtributesSubsection(VendorName, IsOptional, + ParameterType); +} + +void AArch64TargetELFStreamer::emitAttribute(StringRef VendorName, unsigned Tag, + unsigned Value, std::string String, + bool Override) { + if (unsigned(-1) != Value) + AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", Override); + if ("" != String) + AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), String, + Override); +} + void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { getStreamer().emitInst(Inst); } @@ -309,6 +452,9 @@ void AArch64TargetELFStreamer::finish() { MCContext &Ctx = S.getContext(); auto &Asm = S.getAssembler(); + S.emitAttributesSection(AttributeSection, ".ARM.attributes", + ELF::SHT_AARCH64_ATTRIBUTES, AttributeSubSections); + // If ImplicitMapSyms is specified, ensure that text sections end with // the A64 state while non-text sections end with the data state. 
When // sections are combined by the linker, the subsequent section will start with diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 7bd89c9e29a728..74ffe5f97f1b69 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -151,3 +151,107 @@ llvm::createAArch64ObjectTargetStreamer(MCStreamer &S, MCTargetStreamer *llvm::createAArch64NullTargetStreamer(MCStreamer &S) { return new AArch64TargetStreamer(S); } + +void AArch64TargetStreamer::emitAtributesSubsection( + StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional, + AArch64BuildAttributes::SubsectionType ParameterType) { + + // If exists, return. + for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { + if (VendorName == SubSection.VendorName) { + activateAtributesSubsection(VendorName); + return; + } + } + // else, add the subsection + MCELFStreamer::AttributeSubSection AttSubSection; + AttSubSection.VendorName = VendorName; + AttSubSection.IsOptional = IsOptional; + AttSubSection.ParameterType = ParameterType; + AttributeSubSections.push_back(AttSubSection); + activateAtributesSubsection(VendorName); +} + +std::unique_ptr +AArch64TargetStreamer::getActiveAtributesSubsection() { + for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { + if (SubSection.IsActive) { + return std::make_unique(SubSection); + } + } + return nullptr; +} + +std::unique_ptr +AArch64TargetStreamer::getAtributesSubsectionByName(StringRef Name) { + for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { + if (Name == SubSection.VendorName) { + return std::make_unique(SubSection); + } + } + return nullptr; +} + +void AArch64TargetStreamer::emitAttribute(StringRef VendorName, unsigned Tag, + unsigned Value, std::string String, + bool Override) { + + if (unsigned(-1) == 
Value && "" == String) { + assert(0 && "Arguments error"); + return; + } + if (AttributeSubSections.size() == 0) { + assert(0 && + "Can not add AArch64 build attribute: no AArch64 subsection exists"); + return; + } + + for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { + if (VendorName == SubSection.VendorName) { + if (!SubSection.IsActive) { + assert(0 && + "Can not add AArch64 build attribute: subsection is not active"); + return; + } + for (MCELFStreamer::AttributeItem &Item : SubSection.Content) { + if (Item.Tag == Tag) { + if (!Override) { + if ((unsigned(-1) != Value && Item.IntValue != Value) || + ("" != String && Item.StringValue != String)) { + assert(0 && + "Can not add AArch64 build attribute: An attribute with " + "the same tag and a different value already exists"); + return; + } else { + // Case Item.IntValue == Value, no need to emit twice + assert(0 && + "AArch64 build attribute: An attribute with the same tag " + "and a same value already exists"); + return; + } + } + } + } + if (unsigned(-1) != Value) + SubSection.Content.push_back(MCELFStreamer::AttributeItem( + MCELFStreamer::AttributeItem::NumericAttribute, Tag, Value, "")); + if ("" != String) + SubSection.Content.push_back(MCELFStreamer::AttributeItem( + MCELFStreamer::AttributeItem::TextAttribute, Tag, unsigned(-1), + String)); + return; + } + } + assert(0 && "Can not add AArch64 build attribute: required subsection does " + "not exist"); +} + +void AArch64TargetStreamer::activateAtributesSubsection(StringRef VendorName) { + for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) { + if (VendorName == SubSection.VendorName) { + SubSection.IsActive = true; + } else { + SubSection.IsActive = false; + } + } +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index 1c0f5d848c00c6..b2b9afe8670738 100644 --- 
a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -10,7 +10,12 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H #include "AArch64MCExpr.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Instructions.h" +#include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/AArch64BuildAttributes.h" +#include namespace { class AArch64ELFStreamer; @@ -89,6 +94,24 @@ class AArch64TargetStreamer : public MCTargetStreamer { virtual void emitARM64WinCFISaveAnyRegQX(unsigned Reg, int Offset) {} virtual void emitARM64WinCFISaveAnyRegQPX(unsigned Reg, int Offset) {} + /// Build attributes implementation + virtual void + emitAtributesSubsection(StringRef VendorName, + AArch64BuildAttributes::SubsectionOptional IsOptional, + AArch64BuildAttributes::SubsectionType ParameterType); + virtual void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, + std::string String, bool Override); + void activateAtributesSubsection(StringRef VendorName); + std::unique_ptr + getActiveAtributesSubsection(); + std::unique_ptr + getAtributesSubsectionByName(StringRef Name); + void + insertAttributeInPlace(const MCELFStreamer::AttributeItem &Attr, + MCELFStreamer::AttributeSubSection &AttSubSection); + + SmallVector AttributeSubSections; + private: std::unique_ptr ConstantPools; }; @@ -97,6 +120,15 @@ class AArch64TargetELFStreamer : public AArch64TargetStreamer { private: AArch64ELFStreamer &getStreamer(); + MCSection *AttributeSection = nullptr; + + /// Build attributes implementation + void emitAtributesSubsection( + StringRef VendorName, + AArch64BuildAttributes::SubsectionOptional IsOptional, + AArch64BuildAttributes::SubsectionType ParameterType) override; + void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value, + std::string String, bool Override = false) override; void emitInst(uint32_t Inst) override; void 
emitDirectiveVariantPCS(MCSymbol *Symbol) override; void finish() override; diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll new file mode 100644 index 00000000000000..aecc74b2ce46dd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll @@ -0,0 +1,21 @@ +; RUN: llc %s -o - | FileCheck %s --check-prefix=ASM +; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 +; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1 +; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1 + +; ELF: Hex dump of section '.ARM.attributes': +; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu +; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... +; ELF-NEXT: 0x00000020 01010201 + + +target triple = "aarch64-unknown-none-elf" + +!llvm.module.flags = !{!1, !2, !3} + +!1 = !{i32 8, !"branch-target-enforcement", i32 1} +!2 = !{i32 8, !"guarded-control-stack", i32 1} +!3 = !{i32 8, !"sign-return-address", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll new file mode 100644 index 00000000000000..8ec78df13be28b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s | FileCheck %s --check-prefix=ASM +; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 +; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0 +; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0 + +; ELF: Hex dump of section '.ARM.attributes': +; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 
A#...aeabi_featu +; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... +; ELF-NEXT: 0x00000020 01000200 + + +target triple = "aarch64-unknown-none-elf" + +!llvm.module.flags = !{!1} + +!1 = !{i32 8, !"branch-target-enforcement", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll new file mode 100644 index 00000000000000..be528779e82280 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s | FileCheck %s --check-prefix=ASM +; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0 +; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0 +; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1 + +; ELF: Hex dump of section '.ARM.attributes': +; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu +; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
+; ELF-NEXT: 0x00000020 01000201 + + +target triple = "aarch64-unknown-none-elf" + +!llvm.module.flags = !{!1} + +!1 = !{i32 8, !"guarded-control-stack", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll new file mode 100644 index 00000000000000..e3e5933105426a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s | FileCheck %s --check-prefix=ASM +; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0 +; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1 +; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0 + +; ELF: Hex dump of section '.ARM.attributes': +; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu +; ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
+; ELF-NEXT: 0x00000020 01010200 + + +target triple = "aarch64-unknown-none-elf" + +!llvm.module.flags = !{!1} + +!1 = !{i32 8, !"sign-return-address", i32 1} diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll new file mode 100644 index 00000000000000..35ad514c943a5a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s | FileCheck %s --check-prefix=ASM +; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +; ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +; ASM-NEXT: .aeabi_attribute Tag_PAuth_Platform, 2 +; ASM-NEXT: .aeabi_attribute Tag_PAuth_Schema, 31 + +; ELF: Hex dump of section '.ARM.attributes': +; ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth +; ELF-NEXT: 0x00000010 61626900 00000102 021f + + +target triple = "aarch64-unknown-none-elf" + +!llvm.module.flags = !{!1, !2} + +!1 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2} +!2 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31} diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s new file mode 100644 index 00000000000000..acbd0101e13fa3 --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s @@ -0,0 +1,25 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 +// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 1 +// ASM: .aeabi_attribute Tag_Feature_PAC, 1 +// ASM: .aeabi_attribute Tag_Feature_GCS, 
1 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth +// ELF-NEXT: 0x00000010 61626900 00000101 02012300 00006165 abi.......#...ae +// ELF-NEXT: 0x00000020 6162695f 66656174 7572655f 616e645f abi_feature_and_ +// ELF-NEXT: 0x00000030 62697473 00010000 01010102 01 + + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_PAuth_Platform, 1 +.aeabi_attribute Tag_PAuth_Schema, 1 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 1 +.aeabi_attribute Tag_Feature_PAC, 1 +.aeabi_attribute Tag_Feature_GCS, 1 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s new file mode 100644 index 00000000000000..3897fee99c3eea --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 1 +// ASM: .aeabi_attribute Tag_Feature_PAC, 0 +// ASM: .aeabi_attribute Tag_Feature_GCS, 0 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu +// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000001 re_and_bits..... 
+// ELF-NEXT: 0x00000020 01000200 + + +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 1 +.aeabi_attribute Tag_Feature_PAC, 0 +.aeabi_attribute Tag_Feature_GCS, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s new file mode 100644 index 00000000000000..ddf8feb9428d2d --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s @@ -0,0 +1,70 @@ +// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s + +.aeabi_attribute Tag_Feature_BTI, 1 +// ERR: error: no active subsection, build attribute can not be added +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_Feature_BTI, 1 +// ERR: error: unknown AArch64 build attribute 'Tag_Feature_BTI' for subsection 'aeabi_pauthabi' +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1 + +.aeabi_attribute Tag_PAuth_Platform, 4 +// ERR: error: unknown AArch64 build attributes Value for Tag 'Tag_PAuth_Platform' options are 0|1 +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, 4 + +.aeabi_attribute a, 1 +// ERR: error: unknown AArch64 build attribute 'a' for subsection 'aeabi_pauthabi' +// ERR-NEXT: .aeabi_attribute a, 1 + +.aeabi_attribute Tag_PAuth_Platform, Tag_PAuth_Platform +// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, Tag_PAuth_Platform + +.aeabi_attribute Tag_PAuth_Platform, a +// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, a + +.aeabi_attribute Tag_PAuth_Platform, +// ERR: error: AArch64 build attributes value not found +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, + +.aeabi_attribute Tag_PAuth_Platform +// ERR: error: expected comma +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform + 
+.aeabi_attribute +// ERR: error: AArch64 build attributes tag not found +// ERR-NEXT: .aeabi_attribute + +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_PAuth_Platform, 1 +// ERR: unknown AArch64 build attribute 'Tag_PAuth_Platform' for subsection 'aeabi_feature_and_bits' + +.aeabi_attribute a, 1 +// ERR: error: unknown AArch64 build attribute 'a' for subsection 'aeabi_feature_and_bits' + +.aeabi_attribute Tag_Feature_BTI, Tag_Feature_BTI +// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, Tag_Feature_BTI + +.aeabi_attribute Tag_Feature_BTI, a +// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string) +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, a + +.aeabi_attribute Tag_Feature_BTI, +// ERR: error: AArch64 build attributes value not found +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, + +.aeabi_attribute Tag_Feature_BTI +// ERR: error: expected comma +// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI + +.aeabi_attribute +// ERR: error: AArch64 build attributes tag not found +// ERR-NEXT: .aeabi_attribute + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_PAuth_Platform, 1 some_text +// ERR: error: unexpected token for AArch64 build attributes tag and value attribute directive +// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, 1 some_text diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s new file mode 100644 index 00000000000000..9e6dca341e9f86 --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s @@ -0,0 +1,61 @@ +// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s + +.aeabi_subsection aeabi_pauthabi, optional, uleb128 +// ERR: error: aeabi_pauthabi must be marked as required +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, optional, uleb128 + 
+.aeabi_subsection aeabi_pauthabi, required, ntbs +// ERR: error: aeabi_pauthabi must be marked as ULEB128 +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, ntbs + +.aeabi_subsection aeabi_feature_and_bits, required, uleb128 +// ERR: error: aeabi_feature_and_bits must be marked as optional +// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, required, uleb128 + +.aeabi_subsection aeabi_feature_and_bits, optional, ntbs +// ERR: error: aeabi_feature_and_bits must be marked as ULEB128 +// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, ntbs + +.aeabi_subsection 1, required, uleb128 +// ERR: error: subsection name not found +// ERR-NEXT: .aeabi_subsection 1, required, uleb128 + +.aeabi_subsection , required, uleb128 +// ERR: error: subsection name not found +// ERR-NEXT: .aeabi_subsection , required, uleb128 + +.aeabi_subsection aeabi_pauthabi, a, uleb128 +// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128 + +.aeabi_subsection aeabi_pauthabi, a, uleb128 +// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128 + +.aeabi_subsection aeabi_pauthabi, 1, uleb128 +// ERR: error: optionality parameter not found, expected required|optional +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, 1, uleb128 + +.aeabi_subsection aeabi_pauthabi, ,uleb128 +// ERR: error: optionality parameter not found, expected required|optional +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, ,uleb128 + +.aeabi_subsection aeabi_pauthabi,uleb128 +// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: uleb128 +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi,uleb128 + +.aeabi_subsection aeabi_pauthabi uleb128 +// ERR: expected comma +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi uleb128 + +.aeabi_subsection aeabi_pauthabi, required +// ERR: error: expected comma 
+// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required + +.aeabi_subsection aeabi_pauthabi, required, +// ERR: error: type parameter not found, expected uleb128|ntbs +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, + +.aeabi_subsection aeabi_pauthabi, required, a +// ERR: error: unknown AArch64 build attributes type, expected uleb128|ntbs: a +// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, a diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s new file mode 100644 index 00000000000000..5cb7e6835e5c1c --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 0 +// ASM: .aeabi_attribute Tag_Feature_PAC, 0 +// ASM: .aeabi_attribute Tag_Feature_GCS, 1 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu +// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... 
+// ELF-NEXT: 0x00000020 01000201 + + +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 0 +.aeabi_attribute Tag_Feature_PAC, 0 +.aeabi_attribute Tag_Feature_GCS, 1 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s new file mode 100644 index 00000000000000..a3cbbe270dffeb --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s @@ -0,0 +1,25 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute Tag_PAuth_Platform, 0 +// ASM: .aeabi_attribute Tag_PAuth_Schema, 0 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 0 +// ASM: .aeabi_attribute Tag_Feature_PAC, 0 +// ASM: .aeabi_attribute Tag_Feature_GCS, 0 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth +// ELF-NEXT: 0x00000010 61626900 00000100 02002300 00006165 abi.......#...ae +// ELF-NEXT: 0x00000020 6162695f 66656174 7572655f 616e645f abi_feature_and_ +// ELF-NEXT: 0x00000030 62697473 00010000 00010002 00 + + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_PAuth_Platform, 0 +.aeabi_attribute Tag_PAuth_Schema, 0 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 0 +.aeabi_attribute Tag_Feature_PAC, 0 +.aeabi_attribute Tag_Feature_GCS, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s new file mode 100644 index 00000000000000..047939d2efd6ca --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s 
@@ -0,0 +1,39 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM + +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute 0, 1 +// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 +// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 +// ASM: .aeabi_attribute 3, 1 +// ASM: .aeabi_attribute 4, 1 +// ASM: .aeabi_attribute 5, 1 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 1 +// ASM: .aeabi_attribute Tag_Feature_PAC, 1 +// ASM: .aeabi_attribute Tag_Feature_GCS, 1 +// ASM: .aeabi_attribute 3, 1 +// ASM: .aeabi_attribute 4, 1 +// ASM: .aeabi_attribute 5, 1 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41210000 00616561 62695f70 61757468 A!...aeabi_pauth +// ELF-NEXT: 0x00000010 61626900 00000001 01010201 03010401 abi............. +// ELF-NEXT: 0x00000020 05012900 00006165 6162695f 66656174 ..)...aeabi_feat +// ELF-NEXT: 0x00000030 7572655f 616e645f 62697473 00010000 ure_and_bits.... 
+// ELF-NEXT: 0x00000040 01010102 01030104 010501 + + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute 0, 1 +.aeabi_attribute 1, 1 +.aeabi_attribute 2, 1 +.aeabi_attribute 3, 1 +.aeabi_attribute 4, 1 +.aeabi_attribute 5, 1 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute 0, 1 +.aeabi_attribute 1, 1 +.aeabi_attribute 2, 1 +.aeabi_attribute 3, 1 +.aeabi_attribute 4, 1 +.aeabi_attribute 5, 1 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s new file mode 100644 index 00000000000000..2d5d42561aa6f6 --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s @@ -0,0 +1,48 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 1 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute Tag_PAuth_Schema, 1 +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute Tag_PAuth_Platform, 1 +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_GCS, 1 +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_PAC, 0 +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute 7, 1 +// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128 +// ASM: .aeabi_attribute 7, 0 + +// ELF: Hex dump of section 
'.ARM.attributes': +// ELF-NEXT: 0x00000000 411b0000 00616561 62695f70 61757468 A....aeabi_pauth +// ELF-NEXT: 0x00000010 61626900 00000201 01010700 25000000 abi.........%... +// ELF-NEXT: 0x00000020 61656162 695f6665 61747572 655f616e aeabi_feature_an +// ELF-NEXT: 0x00000030 645f6269 74730001 00000102 01010007 d_bits.......... +// ELF-NEXT: 0x00000040 01 + + +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 1 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_PAuth_Schema, 1 +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute Tag_PAuth_Platform, 1 +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_GCS, 1 +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_PAC, 0 +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute 7, 1 +.aeabi_subsection aeabi_pauthabi, required, uleb128 +.aeabi_attribute 7, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s new file mode 100644 index 00000000000000..e3191acf31141e --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +// ASM: .aeabi_attribute Tag_Feature_BTI, 0 +// ASM: .aeabi_attribute Tag_Feature_PAC, 1 +// ASM: .aeabi_attribute Tag_Feature_GCS, 0 + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41230000 
00616561 62695f66 65617475 A#...aeabi_featu +// ELF-NEXT: 0x00000010 72655f61 6e645f62 69747300 01000000 re_and_bits..... +// ELF-NEXT: 0x00000020 01010200 + + +.aeabi_subsection aeabi_feature_and_bits, optional, uleb128 +.aeabi_attribute Tag_Feature_BTI, 0 +.aeabi_attribute Tag_Feature_PAC, 1 +.aeabi_attribute Tag_Feature_GCS, 0 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s new file mode 100644 index 00000000000000..5884a74f989cc1 --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s @@ -0,0 +1,28 @@ +// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s + +.aeabi_subsection private_subsection, optional, uleb128 + +.aeabi_subsection private_subsection, required, uleb128 +// ERR: error: optionality mismatch! subsection 'private_subsection' already exists with optionality defined as 'optional' and not 'required' +// ERR-NEXT: .aeabi_subsection private_subsection, required, uleb128 + +.aeabi_subsection private_subsection, optional, ntbs +// ERR: error: type mismatch! subsection 'private_subsection' already exists with type defined as 'uleb128' and not 'ntbs' +// ERR-NEXT: .aeabi_subsection private_subsection, optional, ntbs + +.aeabi_subsection private_subsection_1, optional, ntbs +.aeabi_attribute 324, 1 +// ERR: error: active subsection type is NTBS (string), found ULEB128 (unsigned) +// ERR-NEXT: .aeabi_attribute 324, 1 + +.aeabi_subsection foo, optional, uleb128 +.aeabi_subsection bar, optional, uleb128 +.aeabi_subsection foo, required, uleb128 +// ERR: error: optionality mismatch! subsection 'foo' already exists with optionality defined as 'optional' and not 'required' +// ERR-NEXT: .aeabi_subsection foo, required, uleb128 + +.aeabi_subsection goo, optional, ntbs +.aeabi_subsection zar, optional, ntbs +.aeabi_subsection goo, optional, uleb128 +// ERR: error: type mismatch! 
subsection 'goo' already exists with type defined as 'ntbs' and not 'uleb128' +// ERR-NEXT: .aeabi_subsection goo, optional, uleb128 diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s new file mode 100644 index 00000000000000..229033a9f6b70d --- /dev/null +++ b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s @@ -0,0 +1,51 @@ +// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF + +// ASM: .aeabi_subsection private_subsection_1, optional, uleb128 +// ASM: .aeabi_attribute 12, 257 +// ASM: .aeabi_subsection private_subsection_2, required, uleb128 +// ASM: .aeabi_attribute 76, 257 +// ASM: .aeabi_subsection private_subsection_3, optional, ntbs +// ASM: .aeabi_attribute 34, hello_llvm +// ASM: .aeabi_subsection private_subsection_4, required, ntbs +// ASM: .aeabi_attribute 777, "hello_llvm" +// ASM: .aeabi_subsection private_subsection_1, optional, uleb128 +// ASM: .aeabi_attribute 876, 257 +// ASM: .aeabi_subsection private_subsection_2, required, uleb128 +// ASM: .aeabi_attribute 876, 257 +// ASM: .aeabi_subsection private_subsection_3, optional, ntbs +// ASM: .aeabi_attribute 876, "hello_llvm" +// ASM: .aeabi_subsection private_subsection_4, required, ntbs +// ASM: .aeabi_attribute 876, hello_llvm + +// ELF: Hex dump of section '.ARM.attributes': +// ELF-NEXT: 0x00000000 41220000 00707269 76617465 5f737562 A"...private_sub +// ELF-NEXT: 0x00000010 73656374 696f6e5f 31000100 0c8102ec section_1....... +// ELF-NEXT: 0x00000020 06810222 00000070 72697661 74655f73 ..."...private_s +// ELF-NEXT: 0x00000030 75627365 6374696f 6e5f3200 00004c81 ubsection_2...L. 
+// ELF-NEXT: 0x00000040 02ec0681 02360000 00707269 76617465 .....6...private +// ELF-NEXT: 0x00000050 5f737562 73656374 696f6e5f 33000101 _subsection_3... +// ELF-NEXT: 0x00000060 2268656c 6c6f5f6c 6c766d00 ec062268 "hello_llvm..."h +// ELF-NEXT: 0x00000070 656c6c6f 5f6c6c76 6d220037 00000070 ello_llvm".7...p +// ELF-NEXT: 0x00000080 72697661 74655f73 75627365 6374696f rivate_subsectio +// ELF-NEXT: 0x00000090 6e5f3400 00018906 2268656c 6c6f5f6c n_4....."hello_l +// ELF-NEXT: 0x000000a0 6c766d22 00ec0668 656c6c6f 5f6c6c76 lvm"...hello_llv +// ELF-NEXT: 0x000000b0 6d00 m. + + +.aeabi_subsection private_subsection_1, optional, uleb128 +.aeabi_attribute 12, 257 +.aeabi_subsection private_subsection_2, required, uleb128 +.aeabi_attribute 76, 257 +.aeabi_subsection private_subsection_3, optional, ntbs +.aeabi_attribute 34, hello_llvm +.aeabi_subsection private_subsection_4, required, ntbs +.aeabi_attribute 777, "hello_llvm" +.aeabi_subsection private_subsection_1, optional, uleb128 +.aeabi_attribute 876, 257 +.aeabi_subsection private_subsection_2, required, uleb128 +.aeabi_attribute 876, 257 +.aeabi_subsection private_subsection_3, optional, ntbs +.aeabi_attribute 876, "hello_llvm" +.aeabi_subsection private_subsection_4, required, ntbs +.aeabi_attribute 876, hello_llvm diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index d152aec19d1b58..008715a0b3dea5 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -33,6 +33,7 @@ static_library("Support") { "Windows", ] sources = [ + "AArch64BuildAttributes.cpp" "ABIBreak.cpp", "AMDGPUMetadata.cpp", "APFixedPoint.cpp", @@ -41,6 +42,7 @@ static_library("Support") { "APSInt.cpp", "ARMAttributeParser.cpp", "ARMBuildAttrs.cpp", + "AArch64BuildAttributes.cpp", "ARMWinEH.cpp", "Allocator.cpp", "AutoConvert.cpp", From 7fb97bee9269f0d4239908ac8def70be696991c6 Mon Sep 17 00:00:00 2001 From: Stephen 
Senran Zhang Date: Thu, 23 Jan 2025 18:00:31 +0800 Subject: [PATCH 112/208] [ConstraintElimination] Add eq/ne facts to signed constraint system (#121423) Facts of eq/ne were added to unsigned system only, causing some missing optimizations. This patch adds eq/ne facts to both signed & unsigned constraint system. Fixes #117961. --- .../Scalar/ConstraintElimination.cpp | 37 +++++++++++--- .../Transforms/ConstraintElimination/eq.ll | 50 +++++++++++++++++++ .../Transforms/ConstraintElimination/ne.ll | 6 +-- .../ConstraintElimination/pr105785.ll | 3 +- 4 files changed, 83 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 91a3c3f0d392a1..fec5036f8f5a22 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -313,7 +313,8 @@ class ConstraintInfo { /// New variables that need to be added to the system are collected in /// \p NewVariables. ConstraintTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, - SmallVectorImpl &NewVariables) const; + SmallVectorImpl &NewVariables, + bool ForceSignedSystem = false) const; /// Turns a comparison of the form \p Op0 \p Pred \p Op1 into a vector of /// constraints using getConstraint. Returns an empty constraint if the result @@ -330,6 +331,14 @@ class ConstraintInfo { void transferToOtherSystem(CmpInst::Predicate Pred, Value *A, Value *B, unsigned NumIn, unsigned NumOut, SmallVectorImpl &DFSInStack); + +private: + /// Adds facts into constraint system. \p ForceSignedSystem can be set when + /// the \p Pred is eq/ne, and signed constraint system is used when it's + /// specified. + void addFactImpl(CmpInst::Predicate Pred, Value *A, Value *B, unsigned NumIn, + unsigned NumOut, SmallVectorImpl &DFSInStack, + bool ForceSignedSystem); }; /// Represents a (Coefficient * Variable) entry after IR decomposition. 
@@ -636,8 +645,12 @@ static Decomposition decompose(Value *V, ConstraintTy ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, - SmallVectorImpl &NewVariables) const { + SmallVectorImpl &NewVariables, + bool ForceSignedSystem) const { assert(NewVariables.empty() && "NewVariables must be empty when passed in"); + assert((!ForceSignedSystem || CmpInst::isEquality(Pred)) && + "signed system can only be forced on eq/ne"); + bool IsEq = false; bool IsNe = false; @@ -652,7 +665,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, break; } case CmpInst::ICMP_EQ: - if (match(Op1, m_Zero())) { + if (!ForceSignedSystem && match(Op1, m_Zero())) { Pred = CmpInst::ICMP_ULE; } else { IsEq = true; @@ -660,7 +673,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, } break; case CmpInst::ICMP_NE: - if (match(Op1, m_Zero())) { + if (!ForceSignedSystem && match(Op1, m_Zero())) { Pred = CmpInst::getSwappedPredicate(CmpInst::ICMP_UGT); std::swap(Op0, Op1); } else { @@ -677,7 +690,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, return {}; SmallVector Preconditions; - bool IsSigned = CmpInst::isSigned(Pred); + bool IsSigned = ForceSignedSystem || CmpInst::isSigned(Pred); auto &Value2Index = getValue2Index(IsSigned); auto ADec = decompose(Op0->stripPointerCastsSameRepresentation(), Preconditions, IsSigned, DL); @@ -737,7 +750,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, int64_t OffsetSum; if (AddOverflow(Offset1, Offset2, OffsetSum)) return {}; - if (Pred == (IsSigned ? 
CmpInst::ICMP_SLT : CmpInst::ICMP_ULT)) + if (Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) if (AddOverflow(OffsetSum, int64_t(-1), OffsetSum)) return {}; R[0] = OffsetSum; @@ -1580,10 +1593,20 @@ static bool checkOrAndOpImpliedByOther( void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, unsigned NumIn, unsigned NumOut, SmallVectorImpl &DFSInStack) { + addFactImpl(Pred, A, B, NumIn, NumOut, DFSInStack, false); + // If the Pred is eq/ne, also add the fact to signed system. + if (CmpInst::isEquality(Pred)) + addFactImpl(Pred, A, B, NumIn, NumOut, DFSInStack, true); +} + +void ConstraintInfo::addFactImpl(CmpInst::Predicate Pred, Value *A, Value *B, + unsigned NumIn, unsigned NumOut, + SmallVectorImpl &DFSInStack, + bool ForceSignedSystem) { // If the constraint has a pre-condition, skip the constraint if it does not // hold. SmallVector NewVariables; - auto R = getConstraint(Pred, A, B, NewVariables); + auto R = getConstraint(Pred, A, B, NewVariables, ForceSignedSystem); // TODO: Support non-equality for facts as well. 
if (!R.isValid(*this) || R.isNe()) diff --git a/llvm/test/Transforms/ConstraintElimination/eq.ll b/llvm/test/Transforms/ConstraintElimination/eq.ll index a9e4dffdcebb0d..04cd39490cdef3 100644 --- a/llvm/test/Transforms/ConstraintElimination/eq.ll +++ b/llvm/test/Transforms/ConstraintElimination/eq.ll @@ -424,3 +424,53 @@ bc_equal: not_eq: ret i1 false } + +define i1 @test_eq_for_signed_cmp(i32 noundef %v0, i32 noundef %v1, i32 noundef %v2) { +; CHECK-LABEL: @test_eq_for_signed_cmp( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2:%.*]], [[V0:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V0]], [[V1:%.*]] +; CHECK-NEXT: [[AND0:%.*]] = and i1 [[CMP1]], [[CMP]] +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[V1]], [[V2]] +; CHECK-NEXT: [[AND1:%.*]] = and i1 false, [[AND0]] +; CHECK-NEXT: ret i1 [[AND1]] +; +entry: + %cmp = icmp eq i32 %v2, %v0 + %cmp1 = icmp sge i32 %v0, %v1 + %and0 = and i1 %cmp1, %cmp + %cmp4 = icmp sgt i32 %v1, %v2 + %and1 = and i1 %cmp4, %and0 + ret i1 %and1 +} + +define i1 @test_eq_for_signed_cmp_with_decompsition(i32 noundef %v0, i32 noundef %v1, i32 noundef %v2, i32 noundef %addend0, i32 noundef %addend1) { +; CHECK-LABEL: @test_eq_for_signed_cmp_with_decompsition( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V0ADD:%.*]] = add nsw i32 [[V0:%.*]], [[ADDEND0:%.*]] +; CHECK-NEXT: [[V1ADD:%.*]] = add nsw i32 [[V1:%.*]], [[ADDEND1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2:%.*]], [[V0ADD]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V0ADD]], [[V1ADD]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[ADDEND0]], 0 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[ADDEND0]], [[ADDEND1]] +; CHECK-NEXT: [[AND0:%.*]] = and i1 [[CMP1]], [[CMP]] +; CHECK-NEXT: [[AND1:%.*]] = and i1 [[AND0]], [[CMP2]] +; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[CMP3]] +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[V1]], [[V2]] +; CHECK-NEXT: [[AND3:%.*]] = and i1 false, [[AND2]] +; CHECK-NEXT: ret i1 [[AND3]] +; +entry: + %v0add = add nsw i32 %v0, %addend0 
+ %v1add = add nsw i32 %v1, %addend1 + %cmp = icmp eq i32 %v2, %v0add + %cmp1 = icmp sge i32 %v0add, %v1add + %cmp2 = icmp sge i32 %addend0, 0 + %cmp3 = icmp slt i32 %addend0, %addend1 + %and0 = and i1 %cmp1, %cmp + %and1 = and i1 %and0, %cmp2 + %and2 = and i1 %and1, %cmp3 + %cmp4 = icmp sgt i32 %v1, %v2 + %and3 = and i1 %cmp4, %and2 + ret i1 %and3 +} diff --git a/llvm/test/Transforms/ConstraintElimination/ne.ll b/llvm/test/Transforms/ConstraintElimination/ne.ll index 566e73dc8d626e..4753860db2851b 100644 --- a/llvm/test/Transforms/ConstraintElimination/ne.ll +++ b/llvm/test/Transforms/ConstraintElimination/ne.ll @@ -71,8 +71,7 @@ define i1 @test_ne_eq_0(i8 %a, i8 %b) { ; CHECK-NEXT: [[RES_13:%.*]] = xor i1 [[RES_12]], false ; CHECK-NEXT: [[RES_14:%.*]] = xor i1 [[RES_13]], false ; CHECK-NEXT: [[RES_15:%.*]] = xor i1 [[RES_14]], false -; CHECK-NEXT: [[C_12:%.*]] = icmp sgt i8 [[A]], 0 -; CHECK-NEXT: [[RES_16:%.*]] = xor i1 [[RES_15]], [[C_12]] +; CHECK-NEXT: [[RES_16:%.*]] = xor i1 [[RES_15]], false ; CHECK-NEXT: ret i1 [[RES_16]] ; entry: @@ -209,8 +208,7 @@ define i1 @test_ne_eq_1(i8 %a, i8 %b) { ; CHECK-NEXT: [[RES_13:%.*]] = xor i1 [[RES_12]], true ; CHECK-NEXT: [[RES_14:%.*]] = xor i1 [[RES_13]], true ; CHECK-NEXT: [[RES_15:%.*]] = xor i1 [[RES_14]], false -; CHECK-NEXT: [[C_12:%.*]] = icmp sgt i8 [[A]], 0 -; CHECK-NEXT: [[RES_16:%.*]] = xor i1 [[RES_15]], [[C_12]] +; CHECK-NEXT: [[RES_16:%.*]] = xor i1 [[RES_15]], true ; CHECK-NEXT: ret i1 [[RES_16]] ; entry: diff --git a/llvm/test/Transforms/ConstraintElimination/pr105785.ll b/llvm/test/Transforms/ConstraintElimination/pr105785.ll index 6c340a11dd2e2c..83b7461720f09c 100644 --- a/llvm/test/Transforms/ConstraintElimination/pr105785.ll +++ b/llvm/test/Transforms/ConstraintElimination/pr105785.ll @@ -15,8 +15,7 @@ define void @pr105785(ptr %p) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[FOR_IND2]], 3 ; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3]], label %[[FOR_COND]] ; CHECK: [[FOR_BODY3]]: -; CHECK-NEXT: 
[[SCMP:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[FOR_IND]], i32 1) -; CHECK-NEXT: store i32 [[SCMP]], ptr [[P]], align 4 +; CHECK-NEXT: store i32 -1, ptr [[P]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[FOR_IND2]], 1 ; CHECK-NEXT: br label %[[FOR_COND1]] ; CHECK: [[FOR_END6]]: From 08195f31ab1c484ad59dea125bfd61316a07eee8 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 23 Jan 2025 11:01:58 +0100 Subject: [PATCH 113/208] [libc++] Inline basic_streambuf functions (#123379) Most of the `basic_streambuf` functions are really simple, which makes most of the implementation when they are out of line boilerplate. --- libcxx/include/streambuf | 256 +++++++++++++++------------------------ 1 file changed, 97 insertions(+), 159 deletions(-) diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf index 7f02a9b3314110..3c4e9086e05ecb 100644 --- a/libcxx/include/streambuf +++ b/libcxx/include/streambuf @@ -146,7 +146,7 @@ public: static_assert(is_same<_CharT, typename traits_type::char_type>::value, "traits_type::char_type must be the same type as CharT"); - virtual ~basic_streambuf(); + virtual ~basic_streambuf() {} // 27.6.2.2.1 locales: inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 locale pubimbue(const locale& __loc) { @@ -229,10 +229,36 @@ public: } protected: - basic_streambuf(); - basic_streambuf(const basic_streambuf& __rhs); - basic_streambuf& operator=(const basic_streambuf& __rhs); - void swap(basic_streambuf& __rhs); + basic_streambuf() {} + basic_streambuf(const basic_streambuf& __sb) + : __loc_(__sb.__loc_), + __binp_(__sb.__binp_), + __ninp_(__sb.__ninp_), + __einp_(__sb.__einp_), + __bout_(__sb.__bout_), + __nout_(__sb.__nout_), + __eout_(__sb.__eout_) {} + + basic_streambuf& operator=(const basic_streambuf& __sb) { + __loc_ = __sb.__loc_; + __binp_ = __sb.__binp_; + __ninp_ = __sb.__ninp_; + __einp_ = __sb.__einp_; + __bout_ = __sb.__bout_; + __nout_ = __sb.__nout_; + __eout_ = __sb.__eout_; + return *this; + } + + void swap(basic_streambuf& 
__sb) { + std::swap(__loc_, __sb.__loc_); + std::swap(__binp_, __sb.__binp_); + std::swap(__ninp_, __sb.__ninp_); + std::swap(__einp_, __sb.__einp_); + std::swap(__bout_, __sb.__bout_); + std::swap(__nout_, __sb.__nout_); + std::swap(__eout_, __sb.__eout_); + } // 27.6.2.3.2 Get area: _LIBCPP_HIDE_FROM_ABI char_type* eback() const { return __binp_; } @@ -267,173 +293,85 @@ protected: // 27.6.2.4 virtual functions: // 27.6.2.4.1 Locales: - virtual void imbue(const locale& __loc); + virtual void imbue(const locale&) {} // 27.6.2.4.2 Buffer management and positioning: - virtual basic_streambuf* setbuf(char_type* __s, streamsize __n); - virtual pos_type - seekoff(off_type __off, ios_base::seekdir __way, ios_base::openmode __which = ios_base::in | ios_base::out); - virtual pos_type seekpos(pos_type __sp, ios_base::openmode __which = ios_base::in | ios_base::out); - virtual int sync(); + virtual basic_streambuf* setbuf(char_type*, streamsize) { return this; } + virtual pos_type seekoff(off_type, ios_base::seekdir, ios_base::openmode = ios_base::in | ios_base::out) { + return pos_type(off_type(-1)); + } + virtual pos_type seekpos(pos_type, ios_base::openmode = ios_base::in | ios_base::out) { + return pos_type(off_type(-1)); + } + virtual int sync() { return 0; } // 27.6.2.4.3 Get area: - virtual streamsize showmanyc(); - virtual streamsize xsgetn(char_type* __s, streamsize __n); - virtual int_type underflow(); - virtual int_type uflow(); + virtual streamsize showmanyc() { return 0; } + + virtual streamsize xsgetn(char_type* __s, streamsize __n) { + const int_type __eof = traits_type::eof(); + int_type __c; + streamsize __i = 0; + while (__i < __n) { + if (__ninp_ < __einp_) { + const streamsize __len = std::min(static_cast(INT_MAX), std::min(__einp_ - __ninp_, __n - __i)); + traits_type::copy(__s, __ninp_, __len); + __s += __len; + __i += __len; + this->gbump(__len); + } else if ((__c = uflow()) != __eof) { + *__s = traits_type::to_char_type(__c); + ++__s; + ++__i; + } 
else + break; + } + return __i; + } + + virtual int_type underflow() { return traits_type::eof(); } + virtual int_type uflow() { + if (underflow() == traits_type::eof()) + return traits_type::eof(); + return traits_type::to_int_type(*__ninp_++); + } // 27.6.2.4.4 Putback: - virtual int_type pbackfail(int_type __c = traits_type::eof()); + virtual int_type pbackfail(int_type = traits_type::eof()) { return traits_type::eof(); } // 27.6.2.4.5 Put area: - virtual streamsize xsputn(const char_type* __s, streamsize __n); - virtual int_type overflow(int_type __c = traits_type::eof()); + virtual streamsize xsputn(const char_type* __s, streamsize __n) { + streamsize __i = 0; + int_type __eof = traits_type::eof(); + while (__i < __n) { + if (__nout_ >= __eout_) { + if (overflow(traits_type::to_int_type(*__s)) == __eof) + break; + ++__s; + ++__i; + } else { + streamsize __chunk_size = std::min(__eout_ - __nout_, __n - __i); + traits_type::copy(__nout_, __s, __chunk_size); + __nout_ += __chunk_size; + __s += __chunk_size; + __i += __chunk_size; + } + } + return __i; + } + + virtual int_type overflow(int_type = traits_type::eof()) { return traits_type::eof(); } private: locale __loc_; - char_type* __binp_; - char_type* __ninp_; - char_type* __einp_; - char_type* __bout_; - char_type* __nout_; - char_type* __eout_; + char_type* __binp_ = nullptr; + char_type* __ninp_ = nullptr; + char_type* __einp_ = nullptr; + char_type* __bout_ = nullptr; + char_type* __nout_ = nullptr; + char_type* __eout_ = nullptr; }; -template -basic_streambuf<_CharT, _Traits>::~basic_streambuf() {} - -template -basic_streambuf<_CharT, _Traits>::basic_streambuf() - : __binp_(nullptr), __ninp_(nullptr), __einp_(nullptr), __bout_(nullptr), __nout_(nullptr), __eout_(nullptr) {} - -template -basic_streambuf<_CharT, _Traits>::basic_streambuf(const basic_streambuf& __sb) - : __loc_(__sb.__loc_), - __binp_(__sb.__binp_), - __ninp_(__sb.__ninp_), - __einp_(__sb.__einp_), - __bout_(__sb.__bout_), - 
__nout_(__sb.__nout_), - __eout_(__sb.__eout_) {} - -template -basic_streambuf<_CharT, _Traits>& basic_streambuf<_CharT, _Traits>::operator=(const basic_streambuf& __sb) { - __loc_ = __sb.__loc_; - __binp_ = __sb.__binp_; - __ninp_ = __sb.__ninp_; - __einp_ = __sb.__einp_; - __bout_ = __sb.__bout_; - __nout_ = __sb.__nout_; - __eout_ = __sb.__eout_; - return *this; -} - -template -void basic_streambuf<_CharT, _Traits>::swap(basic_streambuf& __sb) { - std::swap(__loc_, __sb.__loc_); - std::swap(__binp_, __sb.__binp_); - std::swap(__ninp_, __sb.__ninp_); - std::swap(__einp_, __sb.__einp_); - std::swap(__bout_, __sb.__bout_); - std::swap(__nout_, __sb.__nout_); - std::swap(__eout_, __sb.__eout_); -} - -template -void basic_streambuf<_CharT, _Traits>::imbue(const locale&) {} - -template -basic_streambuf<_CharT, _Traits>* basic_streambuf<_CharT, _Traits>::setbuf(char_type*, streamsize) { - return this; -} - -template -typename basic_streambuf<_CharT, _Traits>::pos_type -basic_streambuf<_CharT, _Traits>::seekoff(off_type, ios_base::seekdir, ios_base::openmode) { - return pos_type(off_type(-1)); -} - -template -typename basic_streambuf<_CharT, _Traits>::pos_type -basic_streambuf<_CharT, _Traits>::seekpos(pos_type, ios_base::openmode) { - return pos_type(off_type(-1)); -} - -template -int basic_streambuf<_CharT, _Traits>::sync() { - return 0; -} - -template -streamsize basic_streambuf<_CharT, _Traits>::showmanyc() { - return 0; -} - -template -streamsize basic_streambuf<_CharT, _Traits>::xsgetn(char_type* __s, streamsize __n) { - const int_type __eof = traits_type::eof(); - int_type __c; - streamsize __i = 0; - while (__i < __n) { - if (__ninp_ < __einp_) { - const streamsize __len = std::min(static_cast(INT_MAX), std::min(__einp_ - __ninp_, __n - __i)); - traits_type::copy(__s, __ninp_, __len); - __s += __len; - __i += __len; - this->gbump(__len); - } else if ((__c = uflow()) != __eof) { - *__s = traits_type::to_char_type(__c); - ++__s; - ++__i; - } else - break; - } - 
return __i; -} - -template -typename basic_streambuf<_CharT, _Traits>::int_type basic_streambuf<_CharT, _Traits>::underflow() { - return traits_type::eof(); -} - -template -typename basic_streambuf<_CharT, _Traits>::int_type basic_streambuf<_CharT, _Traits>::uflow() { - if (underflow() == traits_type::eof()) - return traits_type::eof(); - return traits_type::to_int_type(*__ninp_++); -} - -template -typename basic_streambuf<_CharT, _Traits>::int_type basic_streambuf<_CharT, _Traits>::pbackfail(int_type) { - return traits_type::eof(); -} - -template -streamsize basic_streambuf<_CharT, _Traits>::xsputn(const char_type* __s, streamsize __n) { - streamsize __i = 0; - int_type __eof = traits_type::eof(); - while (__i < __n) { - if (__nout_ >= __eout_) { - if (overflow(traits_type::to_int_type(*__s)) == __eof) - break; - ++__s; - ++__i; - } else { - streamsize __chunk_size = std::min(__eout_ - __nout_, __n - __i); - traits_type::copy(__nout_, __s, __chunk_size); - __nout_ += __chunk_size; - __s += __chunk_size; - __i += __chunk_size; - } - } - return __i; -} - -template -typename basic_streambuf<_CharT, _Traits>::int_type basic_streambuf<_CharT, _Traits>::overflow(int_type) { - return traits_type::eof(); -} - extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_streambuf; # if _LIBCPP_HAS_WIDE_CHARACTERS From 8388040fc9e75d49cd000b3371e2610c6c3548ba Mon Sep 17 00:00:00 2001 From: Jack Frankland Date: Thu, 23 Jan 2025 10:14:00 +0000 Subject: [PATCH 114/208] [mlir][tosa] Add NaN Propagation Mode Support (#121951) The TOSA-V1.0 specification adds "nan propagation" modes as attributes for several operators. Adjust the ODS definitions of the relevant operations to include this attribute. The defined modes are "PROPAGATE" and "IGNORE" and the PROPAGATE mode is set by default. MAXIMUM, MINIMUM, REDUCE_MAX, REDUCE_MIN, MAX_POOL, CLAMP, and ARGMAX support this attribute. 
Signed-off-by: Jack Frankland Co-authored-by: TatWai Chong --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 21 +++-- .../mlir/Dialect/Tosa/IR/TosaTypesBase.td | 8 ++ .../Dialect/Tosa/IR/TosaCanonicalizations.cpp | 85 +++++++++++++++---- mlir/test/Dialect/Tosa/canonicalize.mlir | 52 ++++++++++++ mlir/test/Dialect/Tosa/ops.mlir | 14 +++ 5 files changed, 156 insertions(+), 24 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 2953e006bbe8d1..92ab729f5b933a 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -42,7 +42,8 @@ def Tosa_ArgMaxOp : Tosa_InferShapedTypeOp<"argmax"> { let arguments = (ins Tosa_Tensor: $input, - I32Attr: $axis + I32Attr: $axis, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -287,7 +288,8 @@ def Tosa_MaxPool2dOp : Tosa_InferShapedTypeOp<"max_pool2d"> { Tosa_IntArrayAttr2:$kernel, Tosa_IntArrayAttr2:$stride, - Tosa_IntArrayAttr4:$pad + Tosa_IntArrayAttr4:$pad, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -388,7 +390,8 @@ def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp"> { I64Attr:$min_int, I64Attr:$max_int, Tosa_FloatAttr:$min_fp, - Tosa_FloatAttr:$max_fp + Tosa_FloatAttr:$max_fp, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -752,7 +755,8 @@ def Tosa_MaximumOp : Tosa_ElementwiseOp<"maximum", [ let arguments = (ins Tosa_Tensor:$input1, - Tosa_Tensor:$input2 + Tosa_Tensor:$input2, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -775,7 +779,8 @@ def Tosa_MinimumOp : Tosa_ElementwiseOp<"minimum", [ let arguments = (ins Tosa_Tensor:$input1, - Tosa_Tensor:$input2 + Tosa_Tensor:$input2, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -1382,7 +1387,8 @@ def Tosa_ReduceMaxOp : Tosa_InferTensorTypeOp<"reduce_max"> { let arguments = (ins Tosa_Tensor:$input, - I32Attr:$axis + I32Attr:$axis, + DefaultValuedAttr:$nan_mode ); let results = (outs @@ -1417,7 +1423,8 @@ def 
Tosa_ReduceMinOp : Tosa_InferTensorTypeOp<"reduce_min"> { let arguments = (ins Tosa_Tensor:$input, - I32Attr:$axis + I32Attr:$axis, + DefaultValuedAttr:$nan_mode ); let results = (outs diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 13325fb0ab9a20..5693acf3a01db4 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -205,12 +205,20 @@ def Tosa_FloatAttr : Attr($_self)">, //===----------------------------------------------------------------------===// // Iterable attributes. //===----------------------------------------------------------------------===// +// Defined in `section 3. Enumerations` of the TOSA specification. + // Supported regimes for tosa.resize. def Tosa_ResizeTypeAttr : StringBasedAttr< CPred<"::llvm::cast($_self).getValue() == \"BILINEAR\" || " # "::llvm::cast($_self).getValue() == \"NEAREST_NEIGHBOR\"">, "Supported resize/upsampling strategies">; +// Supported NaN propagation strategies. +def Tosa_NanPropagationAttr : StringBasedAttr< + CPred<"::llvm::cast($_self).getValue() == \"PROPAGATE\" || " # + "::llvm::cast($_self).getValue() == \"IGNORE\"">, + "Supported NaN propagation strategies">; + def Tosa_TensorTypeAttr : TypeAttrBase<"TensorType", "Tensor type attribute">; // Tensor to buffer types. 
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index f7a596f1ccb192..8b883487d1659b 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -339,33 +339,84 @@ struct ClampIsNoOp : public OpRewritePattern { } }; +// Attempts the following transformation: +// +// For integers a, b, a', and b' such that [a, b] ∩ [a', b'] ≠ ∅ and input +// tensor X the following identity holds: +// +// CLAMP(CLAMP(X, a, b), a', b') = CLAMP(X, max(a, a'), min(b, b')) +// +// subject to the following valid NaN propagation semantics: +// -------------------------------------------- +// | OUTER CLAMP | INNER CLAMP | RESULT MODE | +// |-------------|--------------|-------------| +// | PROPAGATE | PROPAGATE | PROPAGATE | +// | PROPAGATE | IGNORE | IGNORE | +// | IGNORE | PROPAGATE | INVALID | +// | IGNORE | IGNORE | IGNORE | +// |------------------------------------------| + struct ClampClampOptimization : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; + // Helper structure to describe the range of a clamp operation. + template + struct ClampRange { + ClampRange(const T &start, const T &end) : start(start), end(end) {} + T start; + T end; + + // Helper function to determine if two Clamp ranges intersect. + bool intersects(const ClampRange &otherRange) { + return start < otherRange.end && otherRange.start < end; + } + }; + LogicalResult matchAndRewrite(tosa::ClampOp op, PatternRewriter &rewriter) const override { - Value input = op.getInput(); - - Operation *definingOp = input.getDefiningOp(); - if (!definingOp) + // Check the input to the CLAMP op is itself a CLAMP. 
+ auto clampOp = + dyn_cast_if_present(op.getInput().getDefiningOp()); + if (!clampOp) return failure(); - if (tosa::ClampOp clampOp = dyn_cast(definingOp)) { - auto minFp = std::max(op.getMinFp(), clampOp.getMinFp()).convertToFloat(); - auto maxFp = std::min(op.getMaxFp(), clampOp.getMaxFp()).convertToFloat(); + // Check we have a valid NaN propagation combination. + const auto opNanMode = op.getNanMode(); + const auto clampNanMode = clampOp.getNanMode(); + if (opNanMode == "IGNORE" && clampNanMode == "PROPAGATE") + return failure(); - auto minInt = std::max(op.getMinInt(), clampOp.getMinInt()); - auto maxInt = std::min(op.getMaxInt(), clampOp.getMaxInt()); + // Check we have intersecting ranges. + const auto opMinInt = op.getMinInt(); + const auto opMaxInt = op.getMaxInt(); + const auto clampOpMinInt = clampOp.getMinInt(); + const auto clampOpMaxInt = clampOp.getMaxInt(); + ClampRange opRangeIntRange(opMinInt, opMaxInt); + ClampRange clampRangeIntRange(clampOpMinInt, clampOpMaxInt); + if (!opRangeIntRange.intersects(clampRangeIntRange)) + return failure(); - rewriter.replaceOpWithNewOp( - op, op.getType(), clampOp.getInput(), - rewriter.getI64IntegerAttr(minInt), - rewriter.getI64IntegerAttr(maxInt), rewriter.getF32FloatAttr(minFp), - rewriter.getF32FloatAttr(maxFp)); - return success(); - } + const auto opMinFloat = op.getMinFp(); + const auto opMaxFloat = op.getMaxFp(); + const auto clampOpMinFloat = clampOp.getMinFp(); + const auto clampOpMaxFloat = clampOp.getMaxFp(); + ClampRange opRangeFloatRange(opMinFloat, opMaxFloat); + ClampRange clampRangeFloatRange(clampOpMinFloat, clampOpMaxFloat); + if (!opRangeFloatRange.intersects(clampRangeFloatRange)) + return failure(); - return failure(); + // Run the transformation. 
+ const auto minFp = std::max(opMinFloat, clampOpMinFloat).convertToFloat(); + const auto maxFp = std::min(opMaxFloat, clampOpMaxFloat).convertToFloat(); + const auto minInt = std::max(opMinInt, clampOpMinInt); + const auto maxInt = std::min(opMaxInt, clampOpMaxInt); + rewriter.replaceOpWithNewOp( + op, op.getType(), clampOp.getInput(), + rewriter.getI64IntegerAttr(minInt), rewriter.getI64IntegerAttr(maxInt), + rewriter.getF32FloatAttr(minFp), rewriter.getF32FloatAttr(maxFp), + rewriter.getStringAttr((opNanMode != clampNanMode) ? "IGNORE" + : opNanMode)); + return success(); } }; diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index e394188e9a9311..6f47f041b9199a 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -138,6 +138,58 @@ func.func @clamp_twice_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { // ----- +// CHECK: @disjoint_clamp_twice_is_not_single_clamp(%[[INPUT:.*]]: tensor<4xi8>) +func.func @disjoint_clamp_twice_is_not_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { + // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_fp = -5.000000e+00 : f32, max_int = -5 : i64, min_fp = -1.000000e+00 : f32, min_int = -10 : i64} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_fp = 5.000000e+00 : f32, max_int = 5 : i64, min_fp = 1.000000e+00 : f32, min_int = 1 : i64} : (tensor<4xi8>) -> tensor<4xi8> + %0 = tosa.clamp %arg0 {max_fp = -5.0 : f32, max_int = -5 : i64, min_fp = -1.0 : f32, min_int = -10 : i64} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 5 : i64, min_fp = 1.0 : f32, min_int = 1 : i64} : (tensor<4xi8>) -> tensor<4xi8> + return %1 : tensor<4xi8> +} + +// ----- + +// CHECK-LABEL: @clamp_twice_with_nan_propagate_is_single_clamp +func.func @clamp_twice_with_nan_propagate_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { + // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : 
f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64} + %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + return %1 : tensor<4xi8> +} + +// ----- + +// CHECK-LABEL: @clamp_twice_with_nan_ignore_is_single_clamp +func.func @clamp_twice_with_nan_ignore_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { + // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} + %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + return %1 : tensor<4xi8> +} + +// ----- + +// CHECK-LABEL: @clamp_twice_with_nan_ignore_propagate_is_single_clamp +func.func @clamp_twice_with_nan_ignore_propagate_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { + // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} + %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + return %1 : tensor<4xi8> +} + +// ----- + +// CHECK: @clamp_twice_with_nan_propagate_ignore_is_not_single_clamp(%[[INPUT:.*]]: tensor<4xi8>) +func.func @clamp_twice_with_nan_propagate_ignore_is_not_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { 
+ // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_fp = 3.000000e+00 : f32, max_int = 4 : i64, min_fp = -5.000000e+00 : f32, min_int = -2 : i64} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_fp = 5.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + return %1 : tensor<4xi8> +} + +// ----- + // CHECK-LABEL: @concat_fold func.func @concat_fold(%arg0: tensor) -> tensor { // CHECK: return %arg0 diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 563c5fa457d351..19b93d7611854d 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -180,6 +180,20 @@ func.func @test_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { return %0 : tensor<13x21x3xf32> } +// ----- +// CHECK-LABEL: clamp_propagate +func.func @test_clamp_propagate(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64, nan_mode = "PROPAGATE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +// CHECK-LABEL: clamp_ignore +func.func @test_clamp_ignore(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64, nan_mode = "IGNORE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- // CHECK-LABEL: clamp_f16 func.func @test_clamp_f16(%arg0: tensor<13x21x3xf16>) -> tensor<13x21x3xf16> { From 
19306351a2c45e266fa11b41eb1362b20b6ca56d Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 23 Jan 2025 10:22:07 +0000 Subject: [PATCH 115/208] [clang][Modules] Raise empty.modulemap expected size to <70KB to fix RISC-V failure (#123959) I'm not sure why the test is larger for RISC-V than other targets, but we saw this before with #111360. The file is just over the current 60KB limit: ``` 62772 /home/asb/llvm-project/build/stage2/tools/clang/test/Modules/Output/empty.modulemap.tmp/base.pcm ``` --- clang/test/Modules/empty.modulemap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Modules/empty.modulemap b/clang/test/Modules/empty.modulemap index f2d37c19d77bcc..8cad8b67b91155 100644 --- a/clang/test/Modules/empty.modulemap +++ b/clang/test/Modules/empty.modulemap @@ -13,8 +13,8 @@ // The module file should be identical each time we produce it. // RUN: diff %t/base.pcm %t/check.pcm // -// We expect an empty module to be less than 60KB (and at least 10K, for now). +// We expect an empty module to be less than 70KB (and at least 10K, for now). // RUN: wc -c %t/base.pcm | FileCheck --check-prefix=CHECK-SIZE %s -// CHECK-SIZE: {{(^|[^0-9])[1-5][0-9][0-9][0-9][0-9]($|[^0-9])}} +// CHECK-SIZE: {{(^|[^0-9])[1-6][0-9][0-9][0-9][0-9]($|[^0-9])}} module empty { header "Inputs/empty.h" export * } From cad6bbade0d7dc57b9c43d9ed8c38260345d50bf Mon Sep 17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Thu, 23 Jan 2025 10:35:58 +0000 Subject: [PATCH 116/208] [C++20][Modules] Fix crash/compiler error due broken AST links (#123648) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This PR fixes bugreport https://github.com/llvm/llvm-project/issues/122493 The root problem is the same as before lambda function and DeclRefExpr references a variable that does not belong to the same module as the enclosing function body. 
Therefore iteration over the function body doesn’t visit the VarDecl. Before this change RelatedDeclsMap was created only for canonical decl but in reality it has to be done for the definition of the function that does not always match the canonical decl. Test Plan: check-clang --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Serialization/ASTReader.h | 17 +++- clang/lib/Serialization/ASTWriterDecl.cpp | 37 +++++--- ...ash-instantiated-in-scope-cxx-modules5.cpp | 92 +++++++++++++++++++ 4 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 clang/test/Headers/crash-instantiated-in-scope-cxx-modules5.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a03f42ab910edd..5989788132c92b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -975,6 +975,8 @@ Bug Fixes to C++ Support - Fixed a nested lambda substitution issue for constraint evaluation. (#GH123441) - Fixed various false diagnostics related to the use of immediate functions. (#GH123472) - Fix immediate escalation not propagating through inherited constructors. (#GH112677) +- Fixed assertions or false compiler diagnostics in the case of C++ modules for + lambda functions or inline friend functions defined inside templates (#GH122493). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 7530015c9dacf3..47301419c76c68 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -546,11 +546,18 @@ class ASTReader /// Mapping from main decl ID to the related decls IDs. /// - /// These related decls have to be loaded right after the main decl. - /// It is required to have canonical declaration for related decls from the - /// same module as the enclosing main decl. 
Without this, due to lazy - /// deserialization, canonical declarations for the main decl and related can - /// be selected from different modules. + /// The key is the main decl ID, and the value is a vector of related decls + /// that must be loaded immediately after the main decl. This is necessary + /// to ensure that the definition for related decls comes from the same module + /// as the enclosing main decl. Without this, due to lazy deserialization, + /// the definition for the main decl and related decls may come from different + /// modules. It is used for the following cases: + /// - Lambda inside a template function definition: The main declaration is + /// the enclosing function, and the related declarations are the lambda + /// declarations. + /// - Friend function defined inside a template CXXRecord declaration: The + /// main declaration is the enclosing record, and the related declarations + /// are the friend functions. llvm::DenseMap> RelatedDeclsMap; struct PendingUpdateRecord { diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 54570dedb0b227..8b9ba04dce91c4 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -27,6 +27,20 @@ using namespace clang; using namespace serialization; +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +namespace { + +// Helper function that returns true if the decl passed in the argument is +// a definition in dependent context. 
+template bool isDefinitionInDependentContext(DT *D) { + return D->isDependentContext() && D->isThisDeclarationADefinition(); +} + +} // namespace + //===----------------------------------------------------------------------===// // Declaration serialization //===----------------------------------------------------------------------===// @@ -801,14 +815,14 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { } if (D->getFriendObjectKind()) { - // For a function defined inline within a class template, we have to force - // the canonical definition to be the one inside the canonical definition of - // the template. Remember this relation to deserialize them together. - if (auto *RD = dyn_cast(D->getLexicalParent())) - if (RD->isDependentContext() && RD->isThisDeclarationADefinition()) { - Writer.RelatedDeclsMap[Writer.GetDeclRef(RD)].push_back( - Writer.GetDeclRef(D)); - } + // For a friend function defined inline within a class template, we have to + // force the definition to be the one inside the definition of the template + // class. Remember this relation to deserialize them together. + if (auto *RD = dyn_cast(D->getLexicalParent()); + RD && isDefinitionInDependentContext(RD)) { + Writer.RelatedDeclsMap[Writer.GetDeclRef(RD)].push_back( + Writer.GetDeclRef(D)); + } } Record.push_back(D->param_size()); @@ -1583,9 +1597,10 @@ void ASTDeclWriter::VisitCXXRecordDecl(CXXRecordDecl *D) { } else { Record.push_back(0); } - // For lambdas inside canonical FunctionDecl remember the mapping. - if (auto FD = llvm::dyn_cast_or_null(D->getDeclContext()); - FD && FD->isCanonicalDecl()) { + // For lambdas inside template functions, remember the mapping to + // deserialize them together. 
+ if (auto *FD = llvm::dyn_cast_or_null(D->getDeclContext()); + FD && isDefinitionInDependentContext(FD)) { Writer.RelatedDeclsMap[Writer.GetDeclRef(FD)].push_back( Writer.GetDeclRef(D)); } diff --git a/clang/test/Headers/crash-instantiated-in-scope-cxx-modules5.cpp b/clang/test/Headers/crash-instantiated-in-scope-cxx-modules5.cpp new file mode 100644 index 00000000000000..352e0125fe4342 --- /dev/null +++ b/clang/test/Headers/crash-instantiated-in-scope-cxx-modules5.cpp @@ -0,0 +1,92 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -std=c++20 -Werror=uninitialized -xc++ -emit-module module.cppmap -fmodule-name=mock_resolver -o mock_resolver.pcm +// RUN: %clang_cc1 -verify -std=c++20 -Werror=uninitialized -xc++ -emit-module module.cppmap -fmodule-name=sql_internal -o sql_internal.pcm +// RUN: %clang_cc1 -verify -std=c++20 -Werror=uninitialized -xc++ -fmodule-file=mock_resolver.pcm -fmodule-file=sql_internal.pcm main.cc -o main.o + +//--- module.cppmap +module "mock_resolver" { + export * + module "mock_resolver.h" { + export * + header "mock_resolver.h" + } +} + +module "sql_internal" { + export * + module "sql_transform_builder.h" { + export * + header "sql_transform_builder.h" + } +} + +//--- set_bits2.h +// expected-no-diagnostics +#pragma once + +template +void fwd(const T& x) {} + +namespace vox::bitset { + +template +void ForEachSetBit2(const TFunc&) { + fwd([](int) { + const int bit_index_base = 0; + (void)[&](int) { + int v = bit_index_base; + }; + }); +} + +} // namespace vox::bitset + +//--- sql_transform_builder.h +// expected-no-diagnostics +#pragma once + +#include "set_bits2.h" + +class QualifyingSet3 { + public: + void GetIndexes() const { + vox::bitset::ForEachSetBit2([]() {}); + } +}; + +template +void DoTransform() { + vox::bitset::ForEachSetBit2([]() {}); +} + +//--- mock_resolver.h +// expected-no-diagnostics +#pragma once +#include "set_bits2.h" + +class QualifyingSet2 { + public: + void GetIndexes() 
const { + vox::bitset::ForEachSetBit2([]() {}); + } +}; + +//--- main.cc +// expected-no-diagnostics +#include "sql_transform_builder.h" + +template +void get(const Callable& fn) { + fwd(fn); +} + +namespace { + +void test() { + get([]() {}); + DoTransform(); +} + +} // namespace From 2e6cc79f816d942ab09d6a310cd925c1da148aa9 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 23 Jan 2025 16:15:52 +0530 Subject: [PATCH 117/208] [MLIR][NVVM] Migrate CpAsyncOp to intrinsics (#123789) Intrinsics are available for the 'cpSize' variants also. So, this patch migrates the Op to lower to the intrinsics for all cases. * Update the existing tests to check the lowering to intrinsics. * Add newer cp_async_zfill tests to verify the lowering for the 'cpSize' variants. * Tidy-up CHECK lines in cp_async() function in nvvmir.mlir (NFC) PTX spec link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async Signed-off-by: Durgadoss R --- .../include/mlir/Dialect/LLVMIR/NVVMDialect.h | 1 + mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 49 ++++--------------- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 38 ++++++++++++++ .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 +-- mlir/test/Target/LLVMIR/nvvmir.mlir | 26 +++++++--- 5 files changed, 70 insertions(+), 52 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h index 50d1a39126ea3e..d474ba8485d5d8 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -21,6 +21,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/InferIntRangeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "mlir/Dialect/LLVMIR/NVVMOpsEnums.h.inc" diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 
797a0067081314..8c8e44a054a627 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -849,55 +849,24 @@ def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", def LoadCacheModifierAttr : EnumAttr; -def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">, +def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global">, Arguments<(ins LLVM_PointerShared:$dst, LLVM_PointerGlobal:$src, I32Attr:$size, LoadCacheModifierAttr:$modifier, Optional:$cpSize)> { - string llvmBuilder = [{ - llvm::Intrinsic::ID id; - switch ($size) { - case 4: - id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_4; - break; - case 8: - id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8; - break; - case 16: - if($modifier == NVVM::LoadCacheModifierKind::CG) - id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16; - else if($modifier == NVVM::LoadCacheModifierKind::CA) - id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16; - else - llvm_unreachable("unsupported cache modifier"); - break; - default: - llvm_unreachable("unsupported async copy size"); - } - createIntrinsicCall(builder, id, {$dst, $src}); - }]; let assemblyFormat = "$dst `,` $src `,` $size `,` `cache` `=` $modifier (`,` $cpSize^)? 
attr-dict `:` type(operands)"; let hasVerifier = 1; let extraClassDeclaration = [{ - bool hasIntrinsic() { if(getCpSize()) return false; return true; } - - void getAsmValues(RewriterBase &rewriter, - llvm::SmallVectorImpl> &asmValues) { - asmValues.push_back({getDst(), PTXRegisterMod::Read}); - asmValues.push_back({getSrc(), PTXRegisterMod::Read}); - asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read}); - asmValues.push_back({getCpSize(), PTXRegisterMod::Read}); - } + static llvm::Intrinsic::ID + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args); }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { - if(getModifier() == NVVM::LoadCacheModifierKind::CG) - return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n"); - if(getModifier() == NVVM::LoadCacheModifierKind::CA) - return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n"); - llvm_unreachable("unsupported cache modifier"); - } + string llvmBuilder = [{ + llvm::SmallVector translatedOperands; + auto id = NVVM::CpAsyncOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, translatedOperands); + createIntrinsicCall(builder, id, translatedOperands); }]; } diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index ccb5ad05f0bf72..dc7e724379ed05 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1110,6 +1110,44 @@ LogicalResult NVVM::BarrierOp::verify() { return success(); } +#define CP_ASYNC_ID_IMPL(mod, size, suffix) \ + llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix + +#define GET_CP_ASYNC_ID(mod, size, has_cpsize) \ + has_cpsize ? 
CP_ASYNC_ID_IMPL(mod, size, _s) : CP_ASYNC_ID_IMPL(mod, size, ) + +llvm::Intrinsic::ID +CpAsyncOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args) { + llvm::Intrinsic::ID id; + + auto cpAsyncOp = cast(op); + bool hasCpSize = cpAsyncOp.getCpSize() ? true : false; + switch (cpAsyncOp.getSize()) { + case 4: + id = GET_CP_ASYNC_ID(ca, 4, hasCpSize); + break; + case 8: + id = GET_CP_ASYNC_ID(ca, 8, hasCpSize); + break; + case 16: + id = (cpAsyncOp.getModifier() == NVVM::LoadCacheModifierKind::CG) + ? GET_CP_ASYNC_ID(cg, 16, hasCpSize) + : GET_CP_ASYNC_ID(ca, 16, hasCpSize); + break; + default: + llvm_unreachable("Invalid copy size in CpAsyncOp."); + } + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(cpAsyncOp.getDst())); + args.push_back(mt.lookupValue(cpAsyncOp.getSrc())); + if (hasCpSize) + args.push_back(mt.lookupValue(cpAsyncOp.getCpSize())); + + return id; +} + llvm::Intrinsic::ID CpAsyncBulkTensorPrefetchOp::getIntrinsicID(int tensorDims, bool isIm2Col) { switch (tensorDims) { diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 84ea55ceb5acc2..c7a6eca1582768 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -74,13 +74,9 @@ func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) { // CHECK-LABEL: @async_cp_zfill func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att - // CHECK-SAME: "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", - // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> () + // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32 nvvm.cp.async.shared.global %dst, %src, 16, cache = cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 - // CHECK: llvm.inline_asm 
has_side_effects asm_dialect = att - // CHECK-SAME: "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", - // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> () + // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 4, cache = ca, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32 nvvm.cp.async.shared.global %dst, %src, 4, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 return } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 09e98765413f0c..7dad9a403def0e 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -488,21 +488,35 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : // CHECK-LABEL: @cp_async llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { -// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) nvvm.cp.async.shared.global %arg0, %arg1, 4, cache = ca : !llvm.ptr<3>, !llvm.ptr<1> -// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) nvvm.cp.async.shared.global %arg0, %arg1, 8, cache = ca : !llvm.ptr<3>, !llvm.ptr<1> -// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1> -// CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) + // CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) 
nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1> -// CHECK: call void @llvm.nvvm.cp.async.commit.group() + + // CHECK: call void @llvm.nvvm.cp.async.commit.group() nvvm.cp.async.commit.group -// CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0) + // CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0) nvvm.cp.async.wait.group 0 llvm.return } +// CHECK-LABEL: @async_cp_zfill +llvm.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) { + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}, i32 %{{.*}}) + nvvm.cp.async.shared.global %dst, %src, 4, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}, i32 %{{.*}}) + nvvm.cp.async.shared.global %dst, %src, 8, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 + // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}, i32 %{{.*}}) + nvvm.cp.async.shared.global %dst, %src, 16, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 + // CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}, i32 %{{.*}}) + nvvm.cp.async.shared.global %dst, %src, 16, cache = cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 + llvm.return +} + // CHECK-LABEL: @cp_async_mbarrier_arrive llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) { // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %{{.*}}) From cb714e74cc0efd5bfdb3e5e80978239425bd83d4 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 23 Jan 2025 10:47:15 +0000 Subject: [PATCH 118/208] [DebugInfo][InstrRef] Avoid producing broken DW_OP_deref_sizes (#123967) We use variable locations such as DBG_VALUE $xmm0 as shorthand to refer to "the low lane of $xmm0", and this is reflected in how DWARF is 
interpreted too. However InstrRefBasedLDV tries to be smart and interprets such a DBG_VALUE as a 128-bit reference. We then issue a DW_OP_deref_size of 128 bits to the stack, which isn't permitted by DWARF (it's larger than a pointer). Solve this for now by not using DW_OP_deref_size if it would be illegal. Instead we'll use DW_OP_deref, and the consumer will load the variable type from the stack, which should be correct. There's still a risk of imprecision when LLVM decides to use smaller or larger value types than the source-variable type, which manifests as too-little or too-much memory being read from the stack. However we can't solve that without putting more type information in debug-info. fixes #64093 --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 21 ++++ .../deref-spills-with-size-too-big.mir | 107 ++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 llvm/test/DebugInfo/MIR/InstrRef/deref-spills-with-size-too-big.mir diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 012bc37dd767a3..2510b77c6d5be4 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1290,6 +1290,27 @@ MLocTracker::emitLoc(const SmallVectorImpl &DbgOps, } } + // https://github.com/llvm/llvm-project/issues/64093 + // in particular #issuecomment-2531264124. We use variable locations + // such as DBG_VALUE $xmm0 as shorthand to refer to "the low lane of + // $xmm0", and this is reflected in how DWARF is interpreted too. + // However InstrRefBasedLDV tries to be smart and interprets such a + // DBG_VALUE as a 128-bit reference. We then issue a DW_OP_deref_size + // of 128 bits to the stack, which isn't permitted by DWARF (it's + // larger than a pointer). + // + // Solve this for now by not using DW_OP_deref_size if it would be + // illegal. 
Instead we'll use DW_OP_deref, and the consumer will load + // the variable type from the stack, which should be correct. + // + // There's still a risk of imprecision when LLVM decides to use + // smaller or larger value types than the source-variable type, which + // manifests as too-little or too-much memory being read from the stack. + // However we can't solve that without putting more type information in + // debug-info. + if (ValueSizeInBits > MF.getTarget().getPointerSizeInBits(0)) + UseDerefSize = false; + SmallVector OffsetOps; TRI.getOffsetOpcodes(Spill.SpillOffset, OffsetOps); bool StackValue = false; diff --git a/llvm/test/DebugInfo/MIR/InstrRef/deref-spills-with-size-too-big.mir b/llvm/test/DebugInfo/MIR/InstrRef/deref-spills-with-size-too-big.mir new file mode 100644 index 00000000000000..49b01dd24ae1de --- /dev/null +++ b/llvm/test/DebugInfo/MIR/InstrRef/deref-spills-with-size-too-big.mir @@ -0,0 +1,107 @@ +# RUN: llc %s -o - -experimental-debug-variable-locations=true \ +# RUN: -run-pass=livedebugvalues \ +# RUN: | FileCheck %s --implicit-check-not=DBG_VALUE +# RUN: llc %s -o - -experimental-debug-variable-locations=true \ +# RUN: -start-before=livedebugvalues -filetype=obj \ +# RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF +# +# LLVM can produce DIExpressions that convert from one value of arbitrary size +# to another. This is normally fine, however that means the value for a +# variable tracked in instruction referencing might not be the same size as the +# variable itself. +# +# We typically use vector registers as shorthand for "the lower lane of the +# vector register", for example if we have a single float we might say +# +# DBG_VALUE $xmm0 +# +# and that's reflected in DWARF too. However, instruction-referencing tries to +# solve several size problems (see deref-spills-with-size.mir), and gets +# confused by this shorthand. 
It manifests in the test sequence below: we +# locate a variable in a vector register, spill it, then force a stack variable +# location to be produced. InstrRefBasedLDV would like to produce a +# DW_OP_deref_size indicating that 128 bits should be loaded for the 32 bit +# register, but this would be wrong (and illegal DWARF as the max load size is +# the pointer size). +# +# As a sticking-plaster fix: detect when we're about to emit these illegal +# DWARF locations, and instead use DW_OP_deref. There's a small risk we +# read too much or too little data, but it's better than emitting illegal DWARF. + +# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "flannel", + +## Check that we're not producing DW_OP_deref_size, instead using the isIndirect +## field of DBG_VALUEs. + +# CHECK: DBG_VALUE $xmm0, $noreg, +# CHECK: DBG_VALUE $rsp, 0, ![[VAR]], !DIExpression(DW_OP_plus_uconst, 8), + +## Check that we produce a breg location with no further expression attached. + +# DWARF: DW_TAG_variable +# DWARF-NEXT: DW_AT_location +# DWARF-NEXT: DW_OP_reg17 XMM0 +# DWARF-NEXT: DW_OP_breg7 RSP+8) +# DWARF-NEXT: DW_AT_name ("flannel") + +--- | + ; ModuleID = 'missingvar.ll' + source_filename = "a" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + define linkonce_odr void @_ZNSt5dequeIPN4llvm4LoopESaIS2_EE13_M_insert_auxESt15_Deque_iteratorIS2_RS2_PS2_EmRKS2_() local_unnamed_addr align 2 !dbg !3 { + entry: + call void @llvm.dbg.value(metadata i32 0, metadata !8, metadata !DIExpression()), !dbg !7 + call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !7 + ret void + } + + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.module.flags = !{!0, !9} + !llvm.dbg.cu = !{!1} + + !0 = !{i32 2, !"Debug Info Version", i32 3} + !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true,
runtimeVersion: 4, emissionKind: FullDebug) + !2 = !DIFile(filename: "bees.cpp", directory: "") + !3 = distinct !DISubprogram(name: "nope", scope: !2, file: !2, line: 1, type: !4, spFlags: DISPFlagDefinition, unit: !1) + !4 = !DISubroutineType(types: !5) + !5 = !{!6} + !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !7 = !DILocation(line: 1, scope: !3) + !8 = !DILocalVariable(name: "flannel", scope: !3, type: !6) + !9 = !{i32 2, !"Dwarf Version", i32 5} + !10 = !DILocalVariable(name: "shoes", scope: !3, type: !11) + !11 = !DIBasicType(name: "long", size: 64, encoding: DW_ATE_signed) + + +... +--- +name: _ZNSt5dequeIPN4llvm4LoopESaIS2_EE13_M_insert_auxESt15_Deque_iteratorIS2_RS2_PS2_EmRKS2_ +alignment: 16 +tracksRegLiveness: true +debugInstrRef: true +liveins: + - { reg: '$rdi' } + - { reg: '$rsi' } + - { reg: '$rdx' } +frameInfo: + stackSize: 16 + offsetAdjustment: -16 + maxAlignment: 16 + maxCallFrameSize: 0 +stack: + - { id: 6, type: spill-slot, offset: -16, size: 16, alignment: 16 } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $rdi, $rdx, $rsi, $rbp, $xmm0 + + + $xmm0 = XORPSrr $xmm0, $xmm0, debug-location !7 + DBG_VALUE $xmm0, $noreg, !8, !DIExpression(), debug-location !7 + VMOVUPSmr $rsp, 1, $noreg, 36, $noreg, $xmm0 :: (store (s128) into %stack.6) + $xmm0 = XORPSrr $xmm0, $xmm0, debug-location !7 + RET64 0, debug-location !7 +... From ad6d808906075c3386bbeada3c37d8d3e6afe248 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 23 Jan 2025 11:16:02 +0000 Subject: [PATCH 119/208] [lldb][DWARFASTParserClang] Make C++ method parsing aware of explicit object parameters (#124096) LLDB deduces the CV-qualifiers and storage class of a C++ method from the object parameter. Currently it assumes that parameter is implicit (and is a pointer type with the name "this"). This isn't true anymore in C++23 with explicit object parameters. 
To support those we can simply check the `DW_AT_object_pointer` of the subprogram DIE (works for both declarations and definitions) when searching for the object parameter. We can also remove the check for `eEncodingIsPointerUID`, because in C++ an artificial parameter called `this` is only ever the implicit object parameter (at least for all the major compilers). --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 9 +- .../DWARF/DWARFASTParserClangTests.cpp | 178 ++++++++++++++++++ 2 files changed, 181 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index f54b7fc9cdad24..682ee6d287bf5c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -173,7 +173,9 @@ GetCXXObjectParameter(const DWARFDIE &subprogram, if (!DeclKindIsCXXClass(containing_decl_ctx.getDeclKind())) return {}; - // FIXME: if subprogram has a explicit DW_AT_object_pointer, use it. 
+ if (DWARFDIE object_parameter = + subprogram.GetAttributeValueAsReferenceDIE(DW_AT_object_pointer)) + return object_parameter; // If no DW_AT_object_pointer was specified, assume the implicit object // parameter is the first parameter to the function, is called "this" and is @@ -215,11 +217,6 @@ static unsigned GetCXXMethodCVQuals(const DWARFDIE &subprogram, return 0; uint32_t encoding_mask = this_type->GetEncodingMask(); - - // FIXME: explicit object parameters need not to be pointers - if (!(encoding_mask & (1u << Type::eEncodingIsPointerUID))) - return 0; - unsigned cv_quals = 0; if (encoding_mask & (1u << Type::eEncodingIsConstUID)) cv_quals |= clang::Qualifiers::Const; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index b31f56aa372d58..9c0300be08a78a 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -902,3 +902,181 @@ TEST_F(DWARFASTParserClangTests, TestParseDWARFAttributes_ObjectPointer) { EXPECT_TRUE(attrs.object_pointer.IsValid()); EXPECT_EQ(attrs.object_pointer, param_die); } + +TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) { + // Tests parsing of a C++ non-static member function with an explicit object + // parameter that isn't called "this" and is not a pointer (but a CV-qualified + // rvalue reference instead). 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - Context + - func + - mySelf + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x4 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x5 + Tag: DW_TAG_rvalue_reference_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x6 + Tag: DW_TAG_const_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x7 + Tag: DW_TAG_volatile_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_structure_type +# DW_AT_name [DW_FORM_strp] ("Context") + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_object_pointer [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0x8 + - Value: 0x1 + - Value: 0x1d + - Value: 0x1 + +# DW_TAG_formal_parameter +# DW_AT_name [DW_FORM_strp] ("mySelf") +# DW_AT_type [DW_FORM_ref4] (const volatile Context 
&&) + - AbbrCode: 0x4 + Values: + - Value: 0xd + - Value: 0x28 + + - AbbrCode: 0x0 + - AbbrCode: 0x0 + +# DW_TAG_rvalue_reference_type +# DW_AT_type [DW_FORM_ref4] ("const volatile Context") + + - AbbrCode: 0x5 + Values: + - Value: 0x2d + +# DW_TAG_const_type +# DW_AT_type [DW_FORM_ref4] ("volatile Context") + + - AbbrCode: 0x6 + Values: + - Value: 0x32 + +# DW_TAG_volatile_type +# DW_AT_type [DW_FORM_ref4] ("Context") + + - AbbrCode: 0x7 + Values: + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto ts_or_err = + cu_die.GetDWARF()->GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); + auto *parser = + static_cast((*ts_or_err)->GetDWARFParser()); + + auto context_die = cu_die.GetFirstChild(); + ASSERT_TRUE(context_die.IsValid()); + ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type); + + SymbolContext sc; + bool new_type; + auto context_type_sp = parser->ParseTypeFromDWARF(sc, context_die, &new_type); + ASSERT_NE(context_type_sp, nullptr); + + ASSERT_TRUE( + parser->CompleteTypeFromDWARF(context_die, context_type_sp.get(), + context_type_sp->GetForwardCompilerType())); + + auto *record_decl = llvm::dyn_cast_or_null( + ClangUtil::GetAsTagDecl(context_type_sp->GetForwardCompilerType())); + ASSERT_NE(record_decl, nullptr); + + auto method_it = record_decl->method_begin(); + ASSERT_NE(method_it, record_decl->method_end()); + + // Check that we didn't parse the function as static. + EXPECT_FALSE(method_it->isStatic()); + + // Check that method qualifiers were correctly set. 
+ EXPECT_EQ(method_it->getMethodQualifiers(), + clang::Qualifiers::fromCVRMask(clang::Qualifiers::Const | + clang::Qualifiers::Volatile)); +} From fa7f0e582bc25a91d89dab7c488a1619060f9bef Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:49:44 +0530 Subject: [PATCH 120/208] [NVPTX] Add Bulk Copy Prefetch Intrinsics (#123226) This patch adds NVVM intrinsics and NVPTX codegen for: - cp.async.bulk.prefetch.L2.* variants - These intrinsics optionally support cache_hints as indicated by the boolean flag argument. - Lit tests are added for all combinations of these intrinsics in cp-async-bulk.ll. - The generated PTX is verified with a 12.3 ptxas executable. - Added docs for these intrinsics in NVPTXUsage.rst file. PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch Co-authored-by: abmajumder --- llvm/docs/NVPTXUsage.rst | 28 +++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 11 ++++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 22 ++++++++++++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 +++++++++ llvm/test/CodeGen/NVPTX/cp-async-bulk.ll | 19 ++++++++++++++ 6 files changed, 93 insertions(+) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 25a230f65fd3dd..a5a78a2882eec3 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -553,6 +553,34 @@ it must be a multiple of 16. For more information, refer PTX ISA ``_. +'``llvm.nvvm.cp.async.bulk.prefetch.L2``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic +corresponds to the ``cp.async.bulk.prefetch.L2.*`` family +of PTX instructions. These instructions initiate an asynchronous +prefetch of bulk data from global memory to the L2 cache. +The 32-bit operand ``%size`` specifies the amount of memory to be +prefetched in terms of bytes and it must be a multiple of 16. + +* The last argument to these intrinsics is boolean flag indicating + support for cache_hint. These flag argument must be compile-time + constant. When set, it indicates a valid cache_hint (``i64 %ch``) + and generates the ``.L2::cache_hint`` variant of the PTX instruction. + +For more information, refer PTX ISA +``_. + '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 00a76018d8415d..00c441920bfa1c 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global NoCapture>, NoCapture>, ImmArg>]>; +// Intrinsics for Bulk Copy Prefetch L2 +def int_nvvm_cp_async_bulk_prefetch_L2 + : DefaultAttrsIntrinsic<[], + [llvm_global_ptr_ty, // src_gmem_ptr + llvm_i32_ty, // copy_size + llvm_i64_ty, // cache_hint + llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, IntrArgMemOnly, + NoCapture>, ReadOnly>, + ImmArg>]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 8f6adf2c22f922..ac8ce05724750c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -3168,6 +3168,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) { ReplaceNode(N, 
CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } +void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) { + // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: + // src, size, cache_hint, cache_hint_flag + // NumOperands = {Chain, IID} + {Actual intrinsic args} + // = {2} + {4} + size_t NumOps = N->getNumOperands(); + bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; + size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint + + SDLoc DL(N); + SmallVector Ops(N->ops().slice(2, NumArgs)); + Ops.push_back(N->getOperand(0)); // Chain operand + + unsigned Opcode = IsCacheHint + ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH + : NVPTX::CP_ASYNC_BULK_PREFETCH; + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); +} + bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); using TMARedTy = llvm::nvvm::TMAReductionOp; @@ -3181,6 +3200,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global: SelectCpAsyncBulkS2G(N); return true; + case Intrinsic::nvvm_cp_async_bulk_prefetch_L2: + SelectCpAsyncBulkPrefetchL2(N); + return true; case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 7661f153238fcd..8dc6bc86c68281 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -93,6 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectI128toV2I64(SDNode *N); void SelectCpAsyncBulkG2S(SDNode *N); void SelectCpAsyncBulkS2G(SDNode *N); + void SelectCpAsyncBulkPrefetchL2(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = 
false); void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 48d75728aef8e2..6198c4aa9b94cb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -547,6 +547,18 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER { defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER; defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER; +//------------------------------ +// Bulk Copy Prefetch Functions +//------------------------------ +def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs), + (ins Int64Regs:$src, Int32Regs:$size), + "cp.async.bulk.prefetch.L2.global [$src], $size;", []>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs), + (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch), + "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>, + Requires<[hasPTX<80>, hasSM<90>]>; //------------------------------------- // TMA Async Bulk Tensor Copy Functions //------------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index aefd18a0632a08..cbb53df4a49b09 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda" declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1) declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1) declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32) +declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1) define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 
%size, i16 %mc, i64 %ch) { ; CHECK-PTX64-LABEL: cp_async_bulk_g2s( @@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3 tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size) ret void } + +define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_prefetch( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0) + ret void +} From 17756aa9c9d2f54a29dba3a2805f217cc1723ff0 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Thu, 23 Jan 2025 12:19:52 +0100 Subject: [PATCH 121/208] [Clang] [Release Notes] Implicit lifetimes are a C++23 feature --- clang/docs/ReleaseNotes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5989788132c92b..5d4b182f29afa0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -294,9 +294,6 @@ C++ Language Changes C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Add ``__builtin_is_implicit_lifetime`` intrinsic, which supports - `P2647R1 A trait for implicit lifetime types `_ - - Add ``__builtin_is_virtual_base_of`` intrinsic, which supports `P2985R0 A type trait 
for detecting virtual base classes `_ @@ -318,6 +315,9 @@ C++23 Feature Support - ``__cpp_explicit_this_parameter`` is now defined. (#GH82780) +- Add ``__builtin_is_implicit_lifetime`` intrinsic, which supports + `P2674R1 A trait for implicit lifetime types `_ + - Add support for `P2280R4 Using unknown pointers and references in constant expressions `_. (#GH63139) C++20 Feature Support From a8020930a8174d84da04fa91b6fef244207f42f5 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 23 Jan 2025 11:20:14 +0000 Subject: [PATCH 122/208] Revert "[lldb][DWARFASTParserClang] Make C++ method parsing aware of explicit object parameters" (#124100) Reverts llvm/llvm-project#124096 Broke linux CI: ``` Note: This is test shard 7 of 42. [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from DWARFASTParserClangTests [ RUN ] DWARFASTParserClangTests.TestParseSubroutine_ExplicitObjectParameter Expected must be checked before access or destruction. Expected value was in success state. (Note: Expected values in success mode must still be checked prior to being destroyed). 
Stack dump without symbol names (ensure you have llvm-symbolizer in your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point to it): 0 SymbolFileDWARFTests 0x0000560271ee5ba7 1 SymbolFileDWARFTests 0x0000560271ee3a2c 2 SymbolFileDWARFTests 0x0000560271ee63ea 3 libc.so.6 0x00007f3e54e5b050 4 libc.so.6 0x00007f3e54ea9e2c 5 libc.so.6 0x00007f3e54e5afb2 gsignal + 18 6 libc.so.6 0x00007f3e54e45472 abort + 211 7 SymbolFileDWARFTests 0x0000560271e79d51 8 SymbolFileDWARFTests 0x0000560271e724f7 9 SymbolFileDWARFTests 0x0000560271f39e2c 10 SymbolFileDWARFTests 0x0000560271f3b368 11 SymbolFileDWARFTests 0x0000560271f3c053 12 SymbolFileDWARFTests 0x0000560271f4cf67 13 SymbolFileDWARFTests 0x0000560271f4c18a 14 SymbolFileDWARFTests 0x0000560271f2561c 15 libc.so.6 0x00007f3e54e4624a 16 libc.so.6 0x00007f3e54e46305 __libc_start_main + 133 17 SymbolFileDWARFTests 0x0000560271e65161 ``` --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 9 +- .../DWARF/DWARFASTParserClangTests.cpp | 178 ------------------ 2 files changed, 6 insertions(+), 181 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 682ee6d287bf5c..f54b7fc9cdad24 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -173,9 +173,7 @@ GetCXXObjectParameter(const DWARFDIE &subprogram, if (!DeclKindIsCXXClass(containing_decl_ctx.getDeclKind())) return {}; - if (DWARFDIE object_parameter = - subprogram.GetAttributeValueAsReferenceDIE(DW_AT_object_pointer)) - return object_parameter; + // FIXME: if subprogram has a explicit DW_AT_object_pointer, use it. 
// If no DW_AT_object_pointer was specified, assume the implicit object // parameter is the first parameter to the function, is called "this" and is @@ -217,6 +215,11 @@ static unsigned GetCXXMethodCVQuals(const DWARFDIE &subprogram, return 0; uint32_t encoding_mask = this_type->GetEncodingMask(); + + // FIXME: explicit object parameters need not to be pointers + if (!(encoding_mask & (1u << Type::eEncodingIsPointerUID))) + return 0; + unsigned cv_quals = 0; if (encoding_mask & (1u << Type::eEncodingIsConstUID)) cv_quals |= clang::Qualifiers::Const; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 9c0300be08a78a..b31f56aa372d58 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -902,181 +902,3 @@ TEST_F(DWARFASTParserClangTests, TestParseDWARFAttributes_ObjectPointer) { EXPECT_TRUE(attrs.object_pointer.IsValid()); EXPECT_EQ(attrs.object_pointer, param_die); } - -TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) { - // Tests parsing of a C++ non-static member function with an explicit object - // parameter that isn't called "this" and is not a pointer (but a CV-qualified - // rvalue reference instead). 
- - const char *yamldata = R"( ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_EXEC - Machine: EM_AARCH64 -DWARF: - debug_str: - - Context - - func - - mySelf - debug_abbrev: - - ID: 0 - Table: - - Code: 0x1 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_language - Form: DW_FORM_data2 - - Code: 0x2 - Tag: DW_TAG_structure_type - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x3 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_declaration - Form: DW_FORM_flag_present - - Attribute: DW_AT_object_pointer - Form: DW_FORM_ref4 - - Attribute: DW_AT_external - Form: DW_FORM_flag_present - - Code: 0x4 - Tag: DW_TAG_formal_parameter - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_type - Form: DW_FORM_ref4 - - Code: 0x5 - Tag: DW_TAG_rvalue_reference_type - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_type - Form: DW_FORM_ref4 - - Code: 0x6 - Tag: DW_TAG_const_type - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_type - Form: DW_FORM_ref4 - - Code: 0x7 - Tag: DW_TAG_volatile_type - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_type - Form: DW_FORM_ref4 - debug_info: - - Version: 5 - UnitType: DW_UT_compile - AddrSize: 8 - Entries: - -# DW_TAG_compile_unit -# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) - - - AbbrCode: 0x1 - Values: - - Value: 0x04 - -# DW_TAG_structure_type -# DW_AT_name [DW_FORM_strp] ("Context") - - - AbbrCode: 0x2 - Values: - - Value: 0x0 - -# DW_TAG_subprogram -# DW_AT_name [DW_FORM_strp] ("func") -# DW_AT_object_pointer [DW_FORM_ref4] - - AbbrCode: 0x3 - Values: - - Value: 0x8 - - Value: 0x1 - - Value: 0x1d - - Value: 0x1 - -# DW_TAG_formal_parameter -# DW_AT_name [DW_FORM_strp] ("mySelf") -# DW_AT_type [DW_FORM_ref4] (const volatile Context 
&&) - - AbbrCode: 0x4 - Values: - - Value: 0xd - - Value: 0x28 - - - AbbrCode: 0x0 - - AbbrCode: 0x0 - -# DW_TAG_rvalue_reference_type -# DW_AT_type [DW_FORM_ref4] ("const volatile Context") - - - AbbrCode: 0x5 - Values: - - Value: 0x2d - -# DW_TAG_const_type -# DW_AT_type [DW_FORM_ref4] ("volatile Context") - - - AbbrCode: 0x6 - Values: - - Value: 0x32 - -# DW_TAG_volatile_type -# DW_AT_type [DW_FORM_ref4] ("Context") - - - AbbrCode: 0x7 - Values: - - Value: 0xf - - - AbbrCode: 0x0 -... -)"; - YAMLModuleTester t(yamldata); - - DWARFUnit *unit = t.GetDwarfUnit(); - ASSERT_NE(unit, nullptr); - const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); - ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); - ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); - DWARFDIE cu_die(unit, cu_entry); - - auto ts_or_err = - cu_die.GetDWARF()->GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); - auto *parser = - static_cast((*ts_or_err)->GetDWARFParser()); - - auto context_die = cu_die.GetFirstChild(); - ASSERT_TRUE(context_die.IsValid()); - ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type); - - SymbolContext sc; - bool new_type; - auto context_type_sp = parser->ParseTypeFromDWARF(sc, context_die, &new_type); - ASSERT_NE(context_type_sp, nullptr); - - ASSERT_TRUE( - parser->CompleteTypeFromDWARF(context_die, context_type_sp.get(), - context_type_sp->GetForwardCompilerType())); - - auto *record_decl = llvm::dyn_cast_or_null( - ClangUtil::GetAsTagDecl(context_type_sp->GetForwardCompilerType())); - ASSERT_NE(record_decl, nullptr); - - auto method_it = record_decl->method_begin(); - ASSERT_NE(method_it, record_decl->method_end()); - - // Check that we didn't parse the function as static. - EXPECT_FALSE(method_it->isStatic()); - - // Check that method qualifiers were correctly set. 
- EXPECT_EQ(method_it->getMethodQualifiers(), - clang::Qualifiers::fromCVRMask(clang::Qualifiers::Const | - clang::Qualifiers::Volatile)); -} From 05fbc3830d05878a0521a3e07aa1e469905ce732 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 23 Jan 2025 11:25:43 +0000 Subject: [PATCH 123/208] [VPlan] Move VPBlockUtils to VPlanUtils.h (NFC) Nothing in VPlan.h directly uses VPBlockUtils.h. Move it out to the more appropriate VPlanUtils.h to reduce the size of the widely included VPlan.h. --- llvm/lib/Transforms/Vectorize/VPlan.h | 144 ------------------- llvm/lib/Transforms/Vectorize/VPlanCFG.h | 1 + llvm/lib/Transforms/Vectorize/VPlanUtils.h | 152 ++++++++++++++++++++- 3 files changed, 151 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index db45ad8aadbbe3..11ba7f06735134 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4202,150 +4202,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { } #endif -//===----------------------------------------------------------------------===// -// VPlan Utilities -//===----------------------------------------------------------------------===// - -/// Class that provides utilities for VPBlockBases in VPlan. -class VPBlockUtils { -public: - VPBlockUtils() = delete; - - /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p - /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's - /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must - /// have neither successors nor predecessors. 
- static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { - assert(NewBlock->getSuccessors().empty() && - NewBlock->getPredecessors().empty() && - "Can't insert new block with predecessors or successors."); - NewBlock->setParent(BlockPtr->getParent()); - SmallVector Succs(BlockPtr->successors()); - for (VPBlockBase *Succ : Succs) { - disconnectBlocks(BlockPtr, Succ); - connectBlocks(NewBlock, Succ); - } - connectBlocks(BlockPtr, NewBlock); - } - - /// Insert disconnected block \p NewBlock before \p Blockptr. First - /// disconnects all predecessors of \p BlockPtr and connects them to \p - /// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as - /// successor of \p NewBlock. - static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { - assert(NewBlock->getSuccessors().empty() && - NewBlock->getPredecessors().empty() && - "Can't insert new block with predecessors or successors."); - NewBlock->setParent(BlockPtr->getParent()); - for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) { - disconnectBlocks(Pred, BlockPtr); - connectBlocks(Pred, NewBlock); - } - connectBlocks(NewBlock, BlockPtr); - } - - /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p - /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p - /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr - /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors - /// and \p IfTrue and \p IfFalse must have neither successors nor - /// predecessors. 
- static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, - VPBlockBase *BlockPtr) { - assert(IfTrue->getSuccessors().empty() && - "Can't insert IfTrue with successors."); - assert(IfFalse->getSuccessors().empty() && - "Can't insert IfFalse with successors."); - BlockPtr->setTwoSuccessors(IfTrue, IfFalse); - IfTrue->setPredecessors({BlockPtr}); - IfFalse->setPredecessors({BlockPtr}); - IfTrue->setParent(BlockPtr->getParent()); - IfFalse->setParent(BlockPtr->getParent()); - } - - /// Connect VPBlockBases \p From and \p To bi-directionally. If \p PredIdx is - /// -1, append \p From to the predecessors of \p To, otherwise set \p To's - /// predecessor at \p PredIdx to \p From. If \p SuccIdx is -1, append \p To to - /// the successors of \p From, otherwise set \p From's successor at \p SuccIdx - /// to \p To. Both VPBlockBases must have the same parent, which can be null. - /// Both VPBlockBases can be already connected to other VPBlockBases. - static void connectBlocks(VPBlockBase *From, VPBlockBase *To, - unsigned PredIdx = -1u, unsigned SuccIdx = -1u) { - assert((From->getParent() == To->getParent()) && - "Can't connect two block with different parents"); - assert((SuccIdx != -1u || From->getNumSuccessors() < 2) && - "Blocks can't have more than two successors."); - if (SuccIdx == -1u) - From->appendSuccessor(To); - else - From->getSuccessors()[SuccIdx] = To; - - if (PredIdx == -1u) - To->appendPredecessor(From); - else - To->getPredecessors()[PredIdx] = From; - } - - /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To - /// from the successors of \p From and \p From from the predecessors of \p To. - static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) { - assert(To && "Successor to disconnect is null."); - From->removeSuccessor(To); - To->removePredecessor(From); - } - - /// Reassociate all the blocks connected to \p Old so that they now point to - /// \p New. 
- static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New) { - for (auto *Pred : to_vector(Old->getPredecessors())) - Pred->replaceSuccessor(Old, New); - for (auto *Succ : to_vector(Old->getSuccessors())) - Succ->replacePredecessor(Old, New); - New->setPredecessors(Old->getPredecessors()); - New->setSuccessors(Old->getSuccessors()); - Old->clearPredecessors(); - Old->clearSuccessors(); - } - - /// Return an iterator range over \p Range which only includes \p BlockTy - /// blocks. The accesses are casted to \p BlockTy. - template - static auto blocksOnly(const T &Range) { - // Create BaseTy with correct const-ness based on BlockTy. - using BaseTy = std::conditional_t::value, - const VPBlockBase, VPBlockBase>; - - // We need to first create an iterator range over (const) BlocktTy & instead - // of (const) BlockTy * for filter_range to work properly. - auto Mapped = - map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; }); - auto Filter = make_filter_range( - Mapped, [](BaseTy &Block) { return isa(&Block); }); - return map_range(Filter, [](BaseTy &Block) -> BlockTy * { - return cast(&Block); - }); - } - - /// Inserts \p BlockPtr on the edge between \p From and \p To. That is, update - /// \p From's successor to \p To to point to \p BlockPtr and \p To's - /// predecessor from \p From to \p BlockPtr. \p From and \p To are added to \p - /// BlockPtr's predecessors and successors respectively. There must be a - /// single edge between \p From and \p To. 
- static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, - VPBlockBase *BlockPtr) { - auto &Successors = From->getSuccessors(); - auto &Predecessors = To->getPredecessors(); - assert(count(Successors, To) == 1 && count(Predecessors, From) == 1 && - "must have single between From and To"); - unsigned SuccIdx = std::distance(Successors.begin(), find(Successors, To)); - unsigned PredIx = - std::distance(Predecessors.begin(), find(Predecessors, From)); - VPBlockUtils::connectBlocks(From, BlockPtr, -1, SuccIdx); - VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1); - } -}; - class VPInterleavedAccessInfo { DenseMap *> InterleaveGroupMap; diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h index 6ca388a953a6ff..8fbdacd1ea7712 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h +++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h @@ -13,6 +13,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H #include "VPlan.h" +#include "VPlanUtils.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index b88a1b14299754..6ddb88308955f1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -16,7 +16,9 @@ class ScalarEvolution; class SCEV; } // namespace llvm -namespace llvm::vputils { +namespace llvm { + +namespace vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(const VPValue *Def); @@ -67,6 +69,152 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan); /// VPDerivedIV or VPCanonicalIVPHI). bool isUniformAcrossVFsAndUFs(VPValue *V); -} // end namespace llvm::vputils +} // namespace vputils + +//===----------------------------------------------------------------------===// +// Utilities for modifying predecessors and successors of VPlan blocks. 
+//===----------------------------------------------------------------------===// + +/// Class that provides utilities for VPBlockBases in VPlan. +class VPBlockUtils { +public: + VPBlockUtils() = delete; + + /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p + /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must + /// have neither successors nor predecessors. + static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { + assert(NewBlock->getSuccessors().empty() && + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); + NewBlock->setParent(BlockPtr->getParent()); + SmallVector Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + connectBlocks(BlockPtr, NewBlock); + } + + /// Insert disconnected block \p NewBlock before \p Blockptr. First + /// disconnects all predecessors of \p BlockPtr and connects them to \p + /// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as + /// successor of \p NewBlock. + static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { + assert(NewBlock->getSuccessors().empty() && + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); + NewBlock->setParent(BlockPtr->getParent()); + for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) { + disconnectBlocks(Pred, BlockPtr); + connectBlocks(Pred, NewBlock); + } + connectBlocks(NewBlock, BlockPtr); + } + + /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p + /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p + /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. 
Propagate \p BlockPtr + /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors + /// and \p IfTrue and \p IfFalse must have neither successors nor + /// predecessors. + static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, + VPBlockBase *BlockPtr) { + assert(IfTrue->getSuccessors().empty() && + "Can't insert IfTrue with successors."); + assert(IfFalse->getSuccessors().empty() && + "Can't insert IfFalse with successors."); + BlockPtr->setTwoSuccessors(IfTrue, IfFalse); + IfTrue->setPredecessors({BlockPtr}); + IfFalse->setPredecessors({BlockPtr}); + IfTrue->setParent(BlockPtr->getParent()); + IfFalse->setParent(BlockPtr->getParent()); + } + + /// Connect VPBlockBases \p From and \p To bi-directionally. If \p PredIdx is + /// -1, append \p From to the predecessors of \p To, otherwise set \p To's + /// predecessor at \p PredIdx to \p From. If \p SuccIdx is -1, append \p To to + /// the successors of \p From, otherwise set \p From's successor at \p SuccIdx + /// to \p To. Both VPBlockBases must have the same parent, which can be null. + /// Both VPBlockBases can be already connected to other VPBlockBases. + static void connectBlocks(VPBlockBase *From, VPBlockBase *To, + unsigned PredIdx = -1u, unsigned SuccIdx = -1u) { + assert((From->getParent() == To->getParent()) && + "Can't connect two block with different parents"); + assert((SuccIdx != -1u || From->getNumSuccessors() < 2) && + "Blocks can't have more than two successors."); + if (SuccIdx == -1u) + From->appendSuccessor(To); + else + From->getSuccessors()[SuccIdx] = To; + + if (PredIdx == -1u) + To->appendPredecessor(From); + else + To->getPredecessors()[PredIdx] = From; + } + + /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To + /// from the successors of \p From and \p From from the predecessors of \p To. 
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) { + assert(To && "Successor to disconnect is null."); + From->removeSuccessor(To); + To->removePredecessor(From); + } + + /// Reassociate all the blocks connected to \p Old so that they now point to + /// \p New. + static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New) { + for (auto *Pred : to_vector(Old->getPredecessors())) + Pred->replaceSuccessor(Old, New); + for (auto *Succ : to_vector(Old->getSuccessors())) + Succ->replacePredecessor(Old, New); + New->setPredecessors(Old->getPredecessors()); + New->setSuccessors(Old->getSuccessors()); + Old->clearPredecessors(); + Old->clearSuccessors(); + } + + /// Return an iterator range over \p Range which only includes \p BlockTy + /// blocks. The accesses are casted to \p BlockTy. + template + static auto blocksOnly(const T &Range) { + // Create BaseTy with correct const-ness based on BlockTy. + using BaseTy = std::conditional_t::value, + const VPBlockBase, VPBlockBase>; + + // We need to first create an iterator range over (const) BlocktTy & instead + // of (const) BlockTy * for filter_range to work properly. + auto Mapped = + map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; }); + auto Filter = make_filter_range( + Mapped, [](BaseTy &Block) { return isa(&Block); }); + return map_range(Filter, [](BaseTy &Block) -> BlockTy * { + return cast(&Block); + }); + } + + /// Inserts \p BlockPtr on the edge between \p From and \p To. That is, update + /// \p From's successor to \p To to point to \p BlockPtr and \p To's + /// predecessor from \p From to \p BlockPtr. \p From and \p To are added to \p + /// BlockPtr's predecessors and successors respectively. There must be a + /// single edge between \p From and \p To. 
+ static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, + VPBlockBase *BlockPtr) { + auto &Successors = From->getSuccessors(); + auto &Predecessors = To->getPredecessors(); + assert(count(Successors, To) == 1 && count(Predecessors, From) == 1 && + "must have single between From and To"); + unsigned SuccIdx = std::distance(Successors.begin(), find(Successors, To)); + unsigned PredIx = + std::distance(Predecessors.begin(), find(Predecessors, From)); + VPBlockUtils::connectBlocks(From, BlockPtr, -1, SuccIdx); + VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1); + } +}; + +} // namespace llvm #endif From 4bcdb26dac4cdadd7f8850a5f9b2e775b73aaf7f Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 23 Jan 2025 11:29:06 +0000 Subject: [PATCH 124/208] Revert "[lldb][test] Remove compiler version check and use regex" (#124101) Reverts llvm/llvm-project#123393 This is causing `TestVectorOfVectorsFromStdModule.py` to fail on the the macOS clang-15 matrix bot. --- .../TestDbgInfoContentVectorFromStdModule.py | 22 +++++---- .../TestVectorOfVectorsFromStdModule.py | 46 +++++++++++++------ 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 759077302bfca4..1c32222e64f14c 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -23,6 +23,13 @@ def test(self): self.runCmd("settings set target.import-std-module true") + if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( + [">", "16.0"] + ): + vector_type = "std::vector" + else: + vector_type = "std::vector >" + size_type = "size_type" value_type = 
"value_type" iterator = "iterator" @@ -34,14 +41,13 @@ def test(self): ValueCheck(name="current"), ] - self.expect( - "expr a", - patterns=[ - """\(std::vector )*>\) \$0 = size=3 \{ - \[0\] = \(a = 3\) - \[1\] = \(a = 1\) - \[2\] = \(a = 2\) -\}""" + self.expect_expr( + "a", + result_type=vector_type, + result_children=[ + ValueCheck(children=[ValueCheck(value="3")]), + ValueCheck(children=[ValueCheck(value="1")]), + ValueCheck(children=[ValueCheck(value="2")]), ], ) diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index e18785ec1359cc..a1f33271f39d2f 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -17,26 +17,42 @@ def test(self): self, "// Set break point at this line.", lldb.SBFileSpec("main.cpp") ) + if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( + [">", "16.0"] + ): + vector_type = "std::vector" + vector_of_vector_type = "std::vector >" + else: + vector_type = "std::vector" + vector_of_vector_type = ( + "std::vector, std::allocator > >" + ) + size_type = "size_type" value_type = "value_type" self.runCmd("settings set target.import-std-module true") - self.expect( - "expr a", - patterns=[ - """\(std::vector(, std::allocator )* >\) \$0 = size=2 \{ - \[0\] = size=3 \{ - \[0\] = 1 - \[1\] = 2 - \[2\] = 3 - \} - \[1\] = size=3 \{ - \[0\] = 3 - \[1\] = 2 - \[2\] = 1 - \} -\}""" + self.expect_expr( + "a", + result_type=vector_of_vector_type, + result_children=[ + ValueCheck( + type=vector_type, + children=[ + ValueCheck(value="1"), + ValueCheck(value="2"), + ValueCheck(value="3"), + ], + ), + ValueCheck( + type=vector_type, + children=[ + ValueCheck(value="3"), + ValueCheck(value="2"), + 
ValueCheck(value="1"), + ], + ), ], ) self.expect_expr("a.size()", result_type=size_type, result_value="2") From 4f26edd5e9eb3b6cea19e15ca8fb2c8416b82fa8 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 23 Jan 2025 03:56:51 -0800 Subject: [PATCH 125/208] [NFC][YAML] Add `IO::error()` (#123475) For #123280 --- llvm/include/llvm/Support/YAMLTraits.h | 4 +++- llvm/lib/Support/YAMLTraits.cpp | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index eca26e90845bf6..e707a445012b51 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -819,6 +819,7 @@ class IO { virtual NodeKind getNodeKind() = 0; virtual void setError(const Twine &) = 0; + virtual std::error_code error() = 0; virtual void setAllowUnknownKeys(bool Allow); template @@ -1448,7 +1449,7 @@ class Input : public IO { ~Input() override; // Check if there was an syntax or semantic error during parsing. - std::error_code error(); + std::error_code error() override; private: bool outputting() const override; @@ -1631,6 +1632,7 @@ class Output : public IO { void scalarTag(std::string &) override; NodeKind getNodeKind() override; void setError(const Twine &message) override; + std::error_code error() override; bool canElideEmptySequence() override; // These are only used by operator<<. They could be private diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp index f326422138488c..28642e004c4f43 100644 --- a/llvm/lib/Support/YAMLTraits.cpp +++ b/llvm/lib/Support/YAMLTraits.cpp @@ -750,6 +750,8 @@ void Output::scalarTag(std::string &Tag) { void Output::setError(const Twine &message) { } +std::error_code Output::error() { return {}; } + bool Output::canElideEmptySequence() { // Normally, with an optional key/value where the value is an empty sequence, // the whole key/value can be not written. 
But, that produces wrong yaml From 1311b36acea0ac0d94c23452fcb0109bb18373cb Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 23 Jan 2025 03:57:19 -0800 Subject: [PATCH 126/208] [llvm][Support] Put back filename into FileToRemoveList (#124065) Prevents avoidable memory leaks. Looks like exchange added in aa1333a91f8d8a060bcf5b14aa32a6e8bab74e8c didn't take "continue" into account. ``` ==llc==2150782==ERROR: LeakSanitizer: detected memory leaks Direct leak of 10 byte(s) in 1 object(s) allocated from: #0 0x5f1b0f9ac14a in strdup llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:593:3 #1 0x5f1b1768428d in FileToRemoveList llvm-project/llvm/lib/Support/Unix/Signals.inc:105:55 ``` --- llvm/lib/Support/Unix/Signals.inc | 34 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 330b5d26fa50be..9a12663228a368 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -149,6 +149,24 @@ public: } } + static void removeFile(char *path) { + // Get the status so we can determine if it's a file or directory. If we + // can't stat the file, ignore it. + struct stat buf; + if (stat(path, &buf) != 0) + return; + + // If this is not a regular file, ignore it. We want to prevent removal + // of special files like /dev/null, even if the compiler is being run + // with the super-user permissions. + if (!S_ISREG(buf.st_mode)) + return; + + // Otherwise, remove the file. We ignore any errors here as there is + // nothing else we can do. + unlink(path); + } + // Signal-safe. static void removeAllFiles(std::atomic &Head) { // If cleanup were to occur while we're removing files we'd have a bad time. @@ -162,21 +180,7 @@ public: // If erasing was occuring while we're trying to remove files we'd look // at free'd data. Take away the path and put it back when done. 
if (char *path = currentFile->Filename.exchange(nullptr)) { - // Get the status so we can determine if it's a file or directory. If we - // can't stat the file, ignore it. - struct stat buf; - if (stat(path, &buf) != 0) - continue; - - // If this is not a regular file, ignore it. We want to prevent removal - // of special files like /dev/null, even if the compiler is being run - // with the super-user permissions. - if (!S_ISREG(buf.st_mode)) - continue; - - // Otherwise, remove the file. We ignore any errors here as there is - // nothing else we can do. - unlink(path); + removeFile(path); // We're done removing the file, erasing can safely proceed. currentFile->Filename.exchange(path); From 636bc72f672712cb848729c0f130d8b42c86f1cb Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 23 Jan 2025 11:34:33 +0000 Subject: [PATCH 127/208] Reland "[lldb][DWARFASTParserClang] Make C++ method parsing aware of explicit object parameters" (#124100)" This reverts commit a8020930a8174d84da04fa91b6fef244207f42f5. Relands original commit but fixing the unit-test to consume the `llvm::Expected` error object. --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 9 +- .../DWARF/DWARFASTParserClangTests.cpp | 180 ++++++++++++++++++ 2 files changed, 183 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index f54b7fc9cdad24..682ee6d287bf5c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -173,7 +173,9 @@ GetCXXObjectParameter(const DWARFDIE &subprogram, if (!DeclKindIsCXXClass(containing_decl_ctx.getDeclKind())) return {}; - // FIXME: if subprogram has a explicit DW_AT_object_pointer, use it. 
+ if (DWARFDIE object_parameter = + subprogram.GetAttributeValueAsReferenceDIE(DW_AT_object_pointer)) + return object_parameter; // If no DW_AT_object_pointer was specified, assume the implicit object // parameter is the first parameter to the function, is called "this" and is @@ -215,11 +217,6 @@ static unsigned GetCXXMethodCVQuals(const DWARFDIE &subprogram, return 0; uint32_t encoding_mask = this_type->GetEncodingMask(); - - // FIXME: explicit object parameters need not to be pointers - if (!(encoding_mask & (1u << Type::eEncodingIsPointerUID))) - return 0; - unsigned cv_quals = 0; if (encoding_mask & (1u << Type::eEncodingIsConstUID)) cv_quals |= clang::Qualifiers::Const; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index b31f56aa372d58..8adda6fba3a0b0 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -902,3 +902,183 @@ TEST_F(DWARFASTParserClangTests, TestParseDWARFAttributes_ObjectPointer) { EXPECT_TRUE(attrs.object_pointer.IsValid()); EXPECT_EQ(attrs.object_pointer, param_die); } + +TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) { + // Tests parsing of a C++ non-static member function with an explicit object + // parameter that isn't called "this" and is not a pointer (but a CV-qualified + // rvalue reference instead). 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - Context + - func + - mySelf + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x4 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x5 + Tag: DW_TAG_rvalue_reference_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x6 + Tag: DW_TAG_const_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x7 + Tag: DW_TAG_volatile_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_structure_type +# DW_AT_name [DW_FORM_strp] ("Context") + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_object_pointer [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0x8 + - Value: 0x1 + - Value: 0x1d + - Value: 0x1 + +# DW_TAG_formal_parameter +# DW_AT_name [DW_FORM_strp] ("mySelf") +# DW_AT_type [DW_FORM_ref4] (const volatile Context 
&&) + - AbbrCode: 0x4 + Values: + - Value: 0xd + - Value: 0x28 + + - AbbrCode: 0x0 + - AbbrCode: 0x0 + +# DW_TAG_rvalue_reference_type +# DW_AT_type [DW_FORM_ref4] ("const volatile Context") + + - AbbrCode: 0x5 + Values: + - Value: 0x2d + +# DW_TAG_const_type +# DW_AT_type [DW_FORM_ref4] ("volatile Context") + + - AbbrCode: 0x6 + Values: + - Value: 0x32 + +# DW_TAG_volatile_type +# DW_AT_type [DW_FORM_ref4] ("Context") + + - AbbrCode: 0x7 + Values: + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto ts_or_err = + cu_die.GetDWARF()->GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); + ASSERT_TRUE(static_cast(ts_or_err)); + llvm::consumeError(ts_or_err.takeError()); + auto *parser = + static_cast((*ts_or_err)->GetDWARFParser()); + + auto context_die = cu_die.GetFirstChild(); + ASSERT_TRUE(context_die.IsValid()); + ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type); + + SymbolContext sc; + bool new_type; + auto context_type_sp = parser->ParseTypeFromDWARF(sc, context_die, &new_type); + ASSERT_NE(context_type_sp, nullptr); + + ASSERT_TRUE( + parser->CompleteTypeFromDWARF(context_die, context_type_sp.get(), + context_type_sp->GetForwardCompilerType())); + + auto *record_decl = llvm::dyn_cast_or_null( + ClangUtil::GetAsTagDecl(context_type_sp->GetForwardCompilerType())); + ASSERT_NE(record_decl, nullptr); + + auto method_it = record_decl->method_begin(); + ASSERT_NE(method_it, record_decl->method_end()); + + // Check that we didn't parse the function as static. + EXPECT_FALSE(method_it->isStatic()); + + // Check that method qualifiers were correctly set. 
+ EXPECT_EQ(method_it->getMethodQualifiers(), + clang::Qualifiers::fromCVRMask(clang::Qualifiers::Const | + clang::Qualifiers::Volatile)); +} From 3ea2b546a8d17014d3ecf05356ecfaadf26ed846 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 23 Jan 2025 13:03:29 +0100 Subject: [PATCH 128/208] [lldb/windows] Make "anonymous" pipe names more unique (#123905) Using a "random" name for an "anonymous" pipe seems to be the state of the art on windows (according to stack overflow, new windows versions may have something better, but it involves calling kernel APIs directly and generally a lot of dark magic). The problem with the current method was that is does not produce unique names if one has two copies of the pipe code in the same process, which is what happened with #120457 (because liblldb only exposes the public api, and we've started using the pipe code in lldb-dap as well). This patch works around the problem by adding the address of the counter variable to the pipe name. Replicating the multiple-copies setup in a test would be very difficult, which is why I'm not adding a test for this scenario. --- lldb/source/Host/windows/PipeWindows.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Host/windows/PipeWindows.cpp b/lldb/source/Host/windows/PipeWindows.cpp index 21e30f0ae87384..e95007ae8fd163 100644 --- a/lldb/source/Host/windows/PipeWindows.cpp +++ b/lldb/source/Host/windows/PipeWindows.cpp @@ -71,9 +71,8 @@ Status PipeWindows::CreateNew(bool child_process_inherit) { // cannot get overlapped i/o on Windows without using a named pipe. So we // synthesize a unique name. uint32_t serial = g_pipe_serial.fetch_add(1); - std::string pipe_name; - llvm::raw_string_ostream pipe_name_stream(pipe_name); - pipe_name_stream << "lldb.pipe." << ::GetCurrentProcessId() << "." 
<< serial; + std::string pipe_name = llvm::formatv( + "lldb.pipe.{0}.{1}.{2}", GetCurrentProcessId(), &g_pipe_serial, serial); return CreateNew(pipe_name.c_str(), child_process_inherit); } From 0236cb689550ed2dac406443c652efb723cb2602 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 23 Jan 2025 13:04:36 +0100 Subject: [PATCH 129/208] [lldb] Enable "frame diagnose" on linux (#123217) .. by changing the signal stop reason format :facepalm: The reason this did not work is because the code in `StopInfo::GetCrashingDereference` was looking for the string "address=" to extract the address of the crash. Macos stop reason strings have the form ``` EXC_BAD_ACCESS (code=1, address=0xdead) ``` while on linux they look like: ``` signal SIGSEGV: address not mapped to object (fault address: 0xdead) ``` Extracting the address from a string sounds like a bad idea, but I suppose there's some value in using a consistent format across platforms, so this patch changes the signal format to use the equals sign as well. All of the diagnose tests pass except one, which appears to fail due to something similar #115453 (disassembler reports unrelocated call targets). I've left the tests disabled on windows, as the stop reason reporting code works very differently there, and I suspect it won't work out of the box. If I'm wrong -- the XFAIL will let us know. 
--- .../Process/Linux/NativeThreadLinux.cpp | 6 +++--- lldb/source/Target/UnixSignals.cpp | 8 ++++---- .../frame/diagnose/array/TestArray.py | 2 +- .../bad-reference/TestBadReference.py | 2 +- .../TestComplicatedExpression.py | 2 +- .../TestDiagnoseDereferenceArgument.py | 2 +- .../TestDiagnoseDereferenceFunctionReturn.py | 2 +- .../TestDiagnoseDereferenceThis.py | 2 +- .../inheritance/TestDiagnoseInheritance.py | 2 +- .../local-variable/TestLocalVariable.py | 2 +- ...estDiagnoseDereferenceVirtualMethodCall.py | 2 +- .../TestAArch64LinuxMTEMemoryTagCoreFile.py | 2 +- .../TestAArch64LinuxMTEMemoryTagFaults.py | 4 ++-- ...stAArch64LinuxNonAddressBitMemoryAccess.py | 2 +- .../Core/x86-32-linux-multithread.test | 2 +- .../Core/x86-64-linux-multithread.test | 2 +- lldb/unittests/Signals/UnixSignalsTest.cpp | 10 +++++----- llvm/docs/ReleaseNotes.md | 20 ++++++++++++++++++- 18 files changed, 46 insertions(+), 28 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index de047ee214c11e..a6d6a78357fe5c 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -326,14 +326,14 @@ void NativeThreadLinux::AnnotateSyncTagCheckFault(lldb::addr_t fault_addr) { } // We assume that the stop description is currently: - // signal SIGSEGV: sync tag check fault (fault address: ) + // signal SIGSEGV: sync tag check fault (fault address=) // Remove the closing ) m_stop_description.pop_back(); std::stringstream ss; std::unique_ptr manager(std::move(details->manager)); - ss << " logical tag: 0x" << std::hex << manager->GetLogicalTag(fault_addr); + ss << " logical tag=0x" << std::hex << manager->GetLogicalTag(fault_addr); std::vector allocation_tag_data; // The fault address may not be granule aligned. 
ReadMemoryTags will granule @@ -347,7 +347,7 @@ void NativeThreadLinux::AnnotateSyncTagCheckFault(lldb::addr_t fault_addr) { llvm::Expected> allocation_tag = manager->UnpackTagsData(allocation_tag_data, 1); if (allocation_tag) { - ss << " allocation tag: 0x" << std::hex << allocation_tag->front() << ")"; + ss << " allocation tag=0x" << std::hex << allocation_tag->front() << ")"; } else { llvm::consumeError(allocation_tag.takeError()); ss << ")"; diff --git a/lldb/source/Target/UnixSignals.cpp b/lldb/source/Target/UnixSignals.cpp index bee3a63818259e..da661003925c79 100644 --- a/lldb/source/Target/UnixSignals.cpp +++ b/lldb/source/Target/UnixSignals.cpp @@ -163,7 +163,7 @@ UnixSignals::GetSignalDescription(int32_t signo, std::optional code, break; case SignalCodePrintOption::Address: if (addr) - strm << " (fault address: 0x" << std::hex << *addr << ")"; + strm << " (fault address=0x" << std::hex << *addr << ")"; break; case SignalCodePrintOption::Bounds: if (lower && upper && addr) { @@ -172,9 +172,9 @@ UnixSignals::GetSignalDescription(int32_t signo, std::optional code, else strm << "upper bound violation "; - strm << "(fault address: 0x" << std::hex << *addr; - strm << ", lower bound: 0x" << std::hex << *lower; - strm << ", upper bound: 0x" << std::hex << *upper; + strm << "(fault address=0x" << std::hex << *addr; + strm << ", lower bound=0x" << std::hex << *lower; + strm << ", upper bound=0x" << std::hex << *upper; strm << ")"; } else strm << sc.m_description.str(); diff --git a/lldb/test/API/commands/frame/diagnose/array/TestArray.py b/lldb/test/API/commands/frame/diagnose/array/TestArray.py index 5de6f7b0aaa1c4..307e2cbca30228 100644 --- a/lldb/test/API/commands/frame/diagnose/array/TestArray.py +++ b/lldb/test/API/commands/frame/diagnose/array/TestArray.py @@ -10,7 +10,7 @@ class TestArray(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git 
a/lldb/test/API/commands/frame/diagnose/bad-reference/TestBadReference.py b/lldb/test/API/commands/frame/diagnose/bad-reference/TestBadReference.py index 7a9498cab13766..8ded5e2ff55c21 100644 --- a/lldb/test/API/commands/frame/diagnose/bad-reference/TestBadReference.py +++ b/lldb/test/API/commands/frame/diagnose/bad-reference/TestBadReference.py @@ -10,7 +10,7 @@ class TestBadReference(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/complicated-expression/TestComplicatedExpression.py b/lldb/test/API/commands/frame/diagnose/complicated-expression/TestComplicatedExpression.py index eb1b556f0c408d..8ee254a28cb545 100644 --- a/lldb/test/API/commands/frame/diagnose/complicated-expression/TestComplicatedExpression.py +++ b/lldb/test/API/commands/frame/diagnose/complicated-expression/TestComplicatedExpression.py @@ -10,7 +10,7 @@ class TestDiagnoseDereferenceArgument(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/dereference-argument/TestDiagnoseDereferenceArgument.py b/lldb/test/API/commands/frame/diagnose/dereference-argument/TestDiagnoseDereferenceArgument.py index 4e867354343879..960dd99ff7f78b 100644 --- a/lldb/test/API/commands/frame/diagnose/dereference-argument/TestDiagnoseDereferenceArgument.py +++ b/lldb/test/API/commands/frame/diagnose/dereference-argument/TestDiagnoseDereferenceArgument.py @@ -10,7 +10,7 @@ class TestDiagnoseDereferenceArgument(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py 
b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py index 4d9b036f5102cb..d0f6ebefa334ad 100644 --- a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py +++ b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py @@ -10,7 +10,7 @@ class TestDiagnoseDereferenceFunctionReturn(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=no_match(lldbplatformutil.getDarwinOSTriples())) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/dereference-this/TestDiagnoseDereferenceThis.py b/lldb/test/API/commands/frame/diagnose/dereference-this/TestDiagnoseDereferenceThis.py index fccba5ca116a9a..7a4d3fb2acb5c6 100644 --- a/lldb/test/API/commands/frame/diagnose/dereference-this/TestDiagnoseDereferenceThis.py +++ b/lldb/test/API/commands/frame/diagnose/dereference-this/TestDiagnoseDereferenceThis.py @@ -10,7 +10,7 @@ class TestDiagnoseDereferenceThis(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/inheritance/TestDiagnoseInheritance.py b/lldb/test/API/commands/frame/diagnose/inheritance/TestDiagnoseInheritance.py index 01245ff7608e1b..71a24002a06274 100644 --- a/lldb/test/API/commands/frame/diagnose/inheritance/TestDiagnoseInheritance.py +++ b/lldb/test/API/commands/frame/diagnose/inheritance/TestDiagnoseInheritance.py @@ -10,7 +10,7 @@ class TestDiagnoseInheritance(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/local-variable/TestLocalVariable.py 
b/lldb/test/API/commands/frame/diagnose/local-variable/TestLocalVariable.py index 9361d80367e128..2db054bec99190 100644 --- a/lldb/test/API/commands/frame/diagnose/local-variable/TestLocalVariable.py +++ b/lldb/test/API/commands/frame/diagnose/local-variable/TestLocalVariable.py @@ -10,7 +10,7 @@ class TestLocalVariable(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/commands/frame/diagnose/virtual-method-call/TestDiagnoseDereferenceVirtualMethodCall.py b/lldb/test/API/commands/frame/diagnose/virtual-method-call/TestDiagnoseDereferenceVirtualMethodCall.py index 7a58203d8f2ed5..ef99b72f52afde 100644 --- a/lldb/test/API/commands/frame/diagnose/virtual-method-call/TestDiagnoseDereferenceVirtualMethodCall.py +++ b/lldb/test/API/commands/frame/diagnose/virtual-method-call/TestDiagnoseDereferenceVirtualMethodCall.py @@ -10,7 +10,7 @@ class TestDiagnoseVirtualMethodCall(TestBase): - @skipUnlessDarwin + @expectedFailureAll(oslist=["windows"]) @skipIf( archs=no_match(["x86_64"]) ) # frame diagnose doesn't work for armv7 or arm64 diff --git a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py index 779050edb054a4..6309648819026a 100644 --- a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py +++ b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py @@ -215,7 +215,7 @@ def test_mte_tag_fault_reason(self): self.expect( "bt", substrs=[ - "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address: 0xffff82c74010)" + "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address=0xffff82c74010)" ], ) diff --git a/lldb/test/API/linux/aarch64/mte_tag_faults/TestAArch64LinuxMTEMemoryTagFaults.py 
b/lldb/test/API/linux/aarch64/mte_tag_faults/TestAArch64LinuxMTEMemoryTagFaults.py index 420aae48234888..2d6470505cf7c4 100644 --- a/lldb/test/API/linux/aarch64/mte_tag_faults/TestAArch64LinuxMTEMemoryTagFaults.py +++ b/lldb/test/API/linux/aarch64/mte_tag_faults/TestAArch64LinuxMTEMemoryTagFaults.py @@ -51,8 +51,8 @@ def test_mte_tag_fault_sync(self): "continue", patterns=[ "\* thread #1, name = 'a.out', stop reason = signal SIGSEGV: " - "sync tag check fault \(fault address: 0x9[0-9A-Fa-f]+11\ " - "logical tag: 0x9 allocation tag: 0xa\)" + "sync tag check fault \(fault address=0x9[0-9A-Fa-f]+11\ " + "logical tag=0x9 allocation tag=0xa\)" ], ) diff --git a/lldb/test/API/linux/aarch64/non_address_bit_memory_access/TestAArch64LinuxNonAddressBitMemoryAccess.py b/lldb/test/API/linux/aarch64/non_address_bit_memory_access/TestAArch64LinuxNonAddressBitMemoryAccess.py index 668fca11903660..f27780358570bb 100644 --- a/lldb/test/API/linux/aarch64/non_address_bit_memory_access/TestAArch64LinuxNonAddressBitMemoryAccess.py +++ b/lldb/test/API/linux/aarch64/non_address_bit_memory_access/TestAArch64LinuxNonAddressBitMemoryAccess.py @@ -202,7 +202,7 @@ def test_non_address_bit_memory_corefile(self): "thread list", substrs=[ "stopped", - "stop reason = SIGSEGV: address not mapped to object (fault address: 0x0)", + "stop reason = SIGSEGV: address not mapped to object (fault address=0x0)", ], ) diff --git a/lldb/test/Shell/Register/Core/x86-32-linux-multithread.test b/lldb/test/Shell/Register/Core/x86-32-linux-multithread.test index eb0cf8708263cb..972e10844a5aad 100644 --- a/lldb/test/Shell/Register/Core/x86-32-linux-multithread.test +++ b/lldb/test/Shell/Register/Core/x86-32-linux-multithread.test @@ -1,7 +1,7 @@ # RUN: %lldb -b -s %s -c %p/Inputs/x86-32-linux-multithread.core | FileCheck %s thread list -# CHECK: * thread #1: tid = 330633, 0x080492d2, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address: 0x0) +# CHECK: * thread #1: tid = 330633, 
0x080492d2, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address=0x0) # CHECK-NEXT: thread #2: tid = 330634, 0x080492dd, stop reason = signal 0 # CHECK-NEXT: thread #3: tid = 330635, 0x080492dd, stop reason = signal 0 # CHECK-NEXT: thread #4: tid = 330632, 0xf7f59549, stop reason = signal 0 diff --git a/lldb/test/Shell/Register/Core/x86-64-linux-multithread.test b/lldb/test/Shell/Register/Core/x86-64-linux-multithread.test index a94a4de1c8080b..5bea84813b44fe 100644 --- a/lldb/test/Shell/Register/Core/x86-64-linux-multithread.test +++ b/lldb/test/Shell/Register/Core/x86-64-linux-multithread.test @@ -1,7 +1,7 @@ # RUN: %lldb -b -s %s -c %p/Inputs/x86-64-linux-multithread.core | FileCheck %s thread list -# CHECK: * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address: 0x0) +# CHECK: * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address=0x0) # CHECK-NEXT: thread #2: tid = 329385, 0x000000000040126d, stop reason = signal 0 # CHECK-NEXT: thread #3: tid = 329386, 0x000000000040126d, stop reason = signal 0 # CHECK-NEXT: thread #4: tid = 329383, 0x00007fcf5582f762, stop reason = signal 0 diff --git a/lldb/unittests/Signals/UnixSignalsTest.cpp b/lldb/unittests/Signals/UnixSignalsTest.cpp index acd39286922501..9a7d9afc2b1859 100644 --- a/lldb/unittests/Signals/UnixSignalsTest.cpp +++ b/lldb/unittests/Signals/UnixSignalsTest.cpp @@ -119,7 +119,7 @@ TEST(UnixSignalsTest, GetAsString) { ASSERT_EQ("SIG16: a specific type of SIG16", signals.GetSignalDescription(16, 1, 0xCAFEF00D)); // Known code that should. - ASSERT_EQ("SIG16: SIG16 with a fault address (fault address: 0xcafef00d)", + ASSERT_EQ("SIG16: SIG16 with a fault address (fault address=0xcafef00d)", signals.GetSignalDescription(16, 2, 0xCAFEF00D)); // No address given just print the code description. 
ASSERT_EQ("SIG16: SIG16 with a fault address", @@ -131,11 +131,11 @@ TEST(UnixSignalsTest, GetAsString) { ASSERT_EQ(expected, signals.GetSignalDescription(16, 3, 0xcafef00d)); ASSERT_EQ(expected, signals.GetSignalDescription(16, 3, 0xcafef00d, 0x1234)); - ASSERT_EQ("SIG16: upper bound violation (fault address: 0x5679, lower bound: " - "0x1234, upper bound: 0x5678)", + ASSERT_EQ("SIG16: upper bound violation (fault address=0x5679, lower bound=" + "0x1234, upper bound=0x5678)", signals.GetSignalDescription(16, 3, 0x5679, 0x1234, 0x5678)); - ASSERT_EQ("SIG16: lower bound violation (fault address: 0x1233, lower bound: " - "0x1234, upper bound: 0x5678)", + ASSERT_EQ("SIG16: lower bound violation (fault address=0x1233, lower bound=" + "0x1234, upper bound=0x5678)", signals.GetSignalDescription(16, 3, 0x1233, 0x1234, 0x5678)); } diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index eb6e9c9b75beb5..05d902641d0933 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -532,7 +532,7 @@ Changes to LLDB New: ``` - * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address: 0x0) + * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address=0x0) 0x7f1e3193e0a7 <+23>: ja 0x7f1e3193e100 ; <+112> ``` @@ -555,6 +555,24 @@ Changes to LLDB * Incorrect floating-point register dwarf number for LoongArch is [fixed](https://github.com/llvm/llvm-project/pull/120391). +* The `frame diagnose` now works on ELF-based systems. After a crash, LLDB will + try to determine the likely cause of the signal, matching Darwin behavior. + This feature requires using a new `lldb-server` version and (like Darwin) only + works on x86 binaries. 
+ + ``` + * thread #1, name = 'a.out', stop reason = signal SIGSEGV: address not mapped to object (fault address=0x4) + frame #0: 0x00005555555551aa a.out`GetSum(f=0x0000555555558018) at main.c:21:37 + 18 } + 19 + 20 int GetSum(struct Foo *f) { + -> 21 return SumTwoIntegers(f->a, f->b->d ? 0 : 1); + 22 } + 23 + 24 int main() { + Likely cause: f->b->d accessed 0x4 + ``` + Changes to BOLT --------------------------------- From 6f684816e25d8b4e5fb2cbc7d0560d608a8bd938 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 23 Jan 2025 13:18:54 +0100 Subject: [PATCH 130/208] [libc++] Use [[clang::no_specializations]] to diagnose invalid user specializations (#118167) Some templates in the standard library are illegal to specialize for users (even if the specialization contains user-defined types). The [[clang::no_specializations]] attribute allows marking such base templates so that the compiler will diagnose if users try adding a specialization. --- .../__compare/compare_three_way_result.h | 3 +- libcxx/include/__config | 7 + libcxx/include/__format/format_arg.h | 2 +- libcxx/include/__ranges/range_adaptor.h | 2 +- libcxx/include/__type_traits/add_cv_quals.h | 6 +- .../__type_traits/add_lvalue_reference.h | 2 +- libcxx/include/__type_traits/add_pointer.h | 2 +- .../__type_traits/add_rvalue_reference.h | 2 +- .../include/__type_traits/aligned_storage.h | 2 +- libcxx/include/__type_traits/aligned_union.h | 2 +- libcxx/include/__type_traits/alignment_of.h | 5 +- libcxx/include/__type_traits/conditional.h | 8 +- libcxx/include/__type_traits/conjunction.h | 9 +- libcxx/include/__type_traits/decay.h | 2 +- libcxx/include/__type_traits/disjunction.h | 4 +- libcxx/include/__type_traits/enable_if.h | 8 +- libcxx/include/__type_traits/extent.h | 4 +- .../has_unique_object_representation.h | 5 +- .../__type_traits/has_virtual_destructor.h | 5 +- .../include/__type_traits/integral_constant.h | 2 +- libcxx/include/__type_traits/invoke.h | 23 ++- 
libcxx/include/__type_traits/is_abstract.h | 5 +- libcxx/include/__type_traits/is_aggregate.h | 5 +- libcxx/include/__type_traits/is_arithmetic.h | 4 +- libcxx/include/__type_traits/is_array.h | 4 +- libcxx/include/__type_traits/is_assignable.h | 12 +- libcxx/include/__type_traits/is_base_of.h | 10 +- .../include/__type_traits/is_bounded_array.h | 10 +- libcxx/include/__type_traits/is_class.h | 4 +- libcxx/include/__type_traits/is_compound.h | 4 +- libcxx/include/__type_traits/is_const.h | 4 +- .../include/__type_traits/is_constructible.h | 18 +- libcxx/include/__type_traits/is_convertible.h | 5 +- .../include/__type_traits/is_destructible.h | 4 +- libcxx/include/__type_traits/is_empty.h | 4 +- libcxx/include/__type_traits/is_enum.h | 8 +- .../__type_traits/is_execution_policy.h | 2 +- libcxx/include/__type_traits/is_final.h | 4 +- .../include/__type_traits/is_floating_point.h | 5 +- libcxx/include/__type_traits/is_function.h | 4 +- libcxx/include/__type_traits/is_fundamental.h | 4 +- .../__type_traits/is_implicit_lifetime.h | 5 +- libcxx/include/__type_traits/is_integral.h | 4 +- .../include/__type_traits/is_literal_type.h | 6 +- .../include/__type_traits/is_member_pointer.h | 14 +- .../__type_traits/is_nothrow_assignable.h | 14 +- .../__type_traits/is_nothrow_constructible.h | 19 +- .../__type_traits/is_nothrow_convertible.h | 4 +- .../__type_traits/is_nothrow_destructible.h | 5 +- .../include/__type_traits/is_null_pointer.h | 5 +- libcxx/include/__type_traits/is_object.h | 4 +- libcxx/include/__type_traits/is_pod.h | 4 +- libcxx/include/__type_traits/is_pointer.h | 4 +- libcxx/include/__type_traits/is_polymorphic.h | 5 +- libcxx/include/__type_traits/is_reference.h | 14 +- libcxx/include/__type_traits/is_same.h | 4 +- libcxx/include/__type_traits/is_scalar.h | 4 +- libcxx/include/__type_traits/is_signed.h | 4 +- .../__type_traits/is_standard_layout.h | 5 +- libcxx/include/__type_traits/is_swappable.h | 19 +- libcxx/include/__type_traits/is_trivial.h | 5 +- 
.../__type_traits/is_trivially_assignable.h | 15 +- .../is_trivially_constructible.h | 20 +- .../__type_traits/is_trivially_copyable.h | 5 +- .../__type_traits/is_trivially_destructible.h | 4 +- .../__type_traits/is_unbounded_array.h | 10 +- libcxx/include/__type_traits/is_union.h | 4 +- libcxx/include/__type_traits/is_unsigned.h | 4 +- libcxx/include/__type_traits/is_void.h | 4 +- libcxx/include/__type_traits/is_volatile.h | 4 +- libcxx/include/__type_traits/make_signed.h | 2 +- libcxx/include/__type_traits/make_unsigned.h | 2 +- libcxx/include/__type_traits/negation.h | 4 +- libcxx/include/__type_traits/rank.h | 10 +- .../__type_traits/remove_all_extents.h | 2 +- libcxx/include/__type_traits/remove_const.h | 2 +- libcxx/include/__type_traits/remove_cv.h | 2 +- libcxx/include/__type_traits/remove_cvref.h | 2 +- libcxx/include/__type_traits/remove_extent.h | 2 +- libcxx/include/__type_traits/remove_pointer.h | 2 +- .../include/__type_traits/remove_reference.h | 2 +- .../include/__type_traits/remove_volatile.h | 2 +- libcxx/include/__type_traits/type_identity.h | 2 +- .../include/__type_traits/underlying_type.h | 2 +- libcxx/include/__type_traits/unwrap_ref.h | 4 +- libcxx/include/execution | 12 +- libcxx/include/variant | 2 +- .../algorithms/no_specializations.verify.cpp | 28 +++ .../no_specializations.verify.cpp | 23 +++ .../ranges/no_specializations.verify.cpp | 23 +++ .../type_traits/no_specializations.verify.cpp | 176 ++++++++++++++++++ .../format/no_specializations.verify.cpp | 23 +++ .../utilities/no_specializations.verify.cpp | 23 +++ 93 files changed, 591 insertions(+), 204 deletions(-) create mode 100644 libcxx/test/libcxx/algorithms/no_specializations.verify.cpp create mode 100644 libcxx/test/libcxx/language.support/no_specializations.verify.cpp create mode 100644 libcxx/test/libcxx/ranges/no_specializations.verify.cpp create mode 100644 libcxx/test/libcxx/type_traits/no_specializations.verify.cpp create mode 100644 
libcxx/test/libcxx/utilities/format/no_specializations.verify.cpp create mode 100644 libcxx/test/libcxx/utilities/no_specializations.verify.cpp diff --git a/libcxx/include/__compare/compare_three_way_result.h b/libcxx/include/__compare/compare_three_way_result.h index d7508073433af4..6ee2eff00302d0 100644 --- a/libcxx/include/__compare/compare_three_way_result.h +++ b/libcxx/include/__compare/compare_three_way_result.h @@ -33,7 +33,8 @@ struct _LIBCPP_HIDE_FROM_ABI __compare_three_way_result< }; template -struct _LIBCPP_TEMPLATE_VIS compare_three_way_result : __compare_three_way_result<_Tp, _Up, void> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS compare_three_way_result + : __compare_three_way_result<_Tp, _Up, void> {}; template using compare_three_way_result_t = typename compare_three_way_result<_Tp, _Up>::type; diff --git a/libcxx/include/__config b/libcxx/include/__config index 5d5c90d7b87a7b..5a0d22588d6dac 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1168,6 +1168,13 @@ typedef __char32_t char32_t; # define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] +# if __has_cpp_attribute(_Clang::__no_specializations__) +# define _LIBCPP_NO_SPECIALIZATIONS \ + [[_Clang::__no_specializations__("Users are not allowed to specialize this standard library entity")]] +# else +# define _LIBCPP_NO_SPECIALIZATIONS +# endif + # if __has_attribute(__standalone_debug__) # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) # else diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index 1c530fd5a5d03b..10f0ba9928ce7c 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -277,7 +277,7 @@ class __basic_format_arg_value { }; template -class _LIBCPP_TEMPLATE_VIS basic_format_arg { +class _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS basic_format_arg { public: class _LIBCPP_TEMPLATE_VIS handle; diff --git a/libcxx/include/__ranges/range_adaptor.h 
b/libcxx/include/__ranges/range_adaptor.h index d944a83406ba7f..4bcb53e1a58465 100644 --- a/libcxx/include/__ranges/range_adaptor.h +++ b/libcxx/include/__ranges/range_adaptor.h @@ -85,7 +85,7 @@ template <_RangeAdaptorClosure _Closure, _RangeAdaptorClosure _OtherClosure> # if _LIBCPP_STD_VER >= 23 template requires is_class_v<_Tp> && same_as<_Tp, remove_cv_t<_Tp>> -class range_adaptor_closure : public __range_adaptor_closure<_Tp> {}; +class _LIBCPP_NO_SPECIALIZATIONS range_adaptor_closure : public __range_adaptor_closure<_Tp> {}; # endif // _LIBCPP_STD_VER >= 23 } // namespace ranges diff --git a/libcxx/include/__type_traits/add_cv_quals.h b/libcxx/include/__type_traits/add_cv_quals.h index 6f671397622ad5..3fbc8d935d26c8 100644 --- a/libcxx/include/__type_traits/add_cv_quals.h +++ b/libcxx/include/__type_traits/add_cv_quals.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS add_const { +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS add_const { using type _LIBCPP_NODEBUG = const _Tp; }; @@ -28,7 +28,7 @@ using add_const_t = typename add_const<_Tp>::type; #endif template -struct _LIBCPP_TEMPLATE_VIS add_cv { +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS add_cv { using type _LIBCPP_NODEBUG = const volatile _Tp; }; @@ -38,7 +38,7 @@ using add_cv_t = typename add_cv<_Tp>::type; #endif template -struct _LIBCPP_TEMPLATE_VIS add_volatile { +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS add_volatile { using type _LIBCPP_NODEBUG = volatile _Tp; }; diff --git a/libcxx/include/__type_traits/add_lvalue_reference.h b/libcxx/include/__type_traits/add_lvalue_reference.h index b1ee6ed73c8ac4..f861420a10e05e 100644 --- a/libcxx/include/__type_traits/add_lvalue_reference.h +++ b/libcxx/include/__type_traits/add_lvalue_reference.h @@ -40,7 +40,7 @@ using __add_lvalue_reference_t = typename __add_lvalue_reference_impl<_Tp>::type #endif // __has_builtin(__add_lvalue_reference) template -struct add_lvalue_reference { +struct 
_LIBCPP_NO_SPECIALIZATIONS add_lvalue_reference { using type _LIBCPP_NODEBUG = __add_lvalue_reference_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/add_pointer.h b/libcxx/include/__type_traits/add_pointer.h index b53d8eae708e2f..d10b5cf70c64db 100644 --- a/libcxx/include/__type_traits/add_pointer.h +++ b/libcxx/include/__type_traits/add_pointer.h @@ -41,7 +41,7 @@ using __add_pointer_t = typename __add_pointer_impl<_Tp>::type; #endif // !defined(_LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS) && __has_builtin(__add_pointer) template -struct add_pointer { +struct _LIBCPP_NO_SPECIALIZATIONS add_pointer { using type _LIBCPP_NODEBUG = __add_pointer_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/add_rvalue_reference.h b/libcxx/include/__type_traits/add_rvalue_reference.h index d844ccc1f539dc..ed4f8633bce1f2 100644 --- a/libcxx/include/__type_traits/add_rvalue_reference.h +++ b/libcxx/include/__type_traits/add_rvalue_reference.h @@ -40,7 +40,7 @@ using __add_rvalue_reference_t = typename __add_rvalue_reference_impl<_Tp>::type #endif // __has_builtin(__add_rvalue_reference) template -struct add_rvalue_reference { +struct _LIBCPP_NO_SPECIALIZATIONS add_rvalue_reference { using type = __add_rvalue_reference_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 3c39a351e35010..a5851c37fde612 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -68,7 +68,7 @@ struct __find_max_align<__type_list<_Head, _Tail...>, _Len> __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {}; template ::value> -struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage { +struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS aligned_storage { union _ALIGNAS(_Align) type { unsigned char __data[(_Len + _Align - 1) / _Align * _Align]; }; diff --git 
a/libcxx/include/__type_traits/aligned_union.h b/libcxx/include/__type_traits/aligned_union.h index fa7d985b56dd3d..1223dc25e40a09 100644 --- a/libcxx/include/__type_traits/aligned_union.h +++ b/libcxx/include/__type_traits/aligned_union.h @@ -33,7 +33,7 @@ struct __static_max<_I0, _I1, _In...> { }; template -struct _LIBCPP_DEPRECATED_IN_CXX23 aligned_union { +struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_NO_SPECIALIZATIONS aligned_union { static const size_t alignment_value = __static_max<_LIBCPP_PREFERRED_ALIGNOF(_Type0), _LIBCPP_PREFERRED_ALIGNOF(_Types)...>::value; static const size_t __len = __static_max<_Len, sizeof(_Type0), sizeof(_Types)...>::value; diff --git a/libcxx/include/__type_traits/alignment_of.h b/libcxx/include/__type_traits/alignment_of.h index 8871c8ce110d67..9801cac2cadd26 100644 --- a/libcxx/include/__type_traits/alignment_of.h +++ b/libcxx/include/__type_traits/alignment_of.h @@ -20,11 +20,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS alignment_of : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS alignment_of + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr size_t alignment_of_v = _LIBCPP_ALIGNOF(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr size_t alignment_of_v = _LIBCPP_ALIGNOF(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/conditional.h b/libcxx/include/__type_traits/conditional.h index 5b5445a8374271..20460c7f2e39c3 100644 --- a/libcxx/include/__type_traits/conditional.h +++ b/libcxx/include/__type_traits/conditional.h @@ -36,13 +36,19 @@ template using _If _LIBCPP_NODEBUG = typename _IfImpl<_Cond>::template _Select<_IfRes, _ElseRes>; template -struct _LIBCPP_TEMPLATE_VIS conditional { +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS conditional { using type _LIBCPP_NODEBUG = _If; }; + +_LIBCPP_DIAGNOSTIC_PUSH +#if __has_warning("-Winvalid-specialization") 
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +#endif template struct _LIBCPP_TEMPLATE_VIS conditional { using type _LIBCPP_NODEBUG = _Then; }; +_LIBCPP_DIAGNOSTIC_POP #if _LIBCPP_STD_VER >= 14 template diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h index 4001d6c12d5df0..ad9656acd47ecc 100644 --- a/libcxx/include/__type_traits/conjunction.h +++ b/libcxx/include/__type_traits/conjunction.h @@ -47,16 +47,21 @@ struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)... #if _LIBCPP_STD_VER >= 17 template -struct conjunction : true_type {}; +struct _LIBCPP_NO_SPECIALIZATIONS conjunction : true_type {}; +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template struct conjunction<_Arg> : _Arg {}; template struct conjunction<_Arg, _Args...> : conditional_t> {}; +_LIBCPP_DIAGNOSTIC_POP template -inline constexpr bool conjunction_v = conjunction<_Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = conjunction<_Args...>::value; #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__type_traits/decay.h b/libcxx/include/__type_traits/decay.h index da0c4d340e9bbb..2fd73d3dde45df 100644 --- a/libcxx/include/__type_traits/decay.h +++ b/libcxx/include/__type_traits/decay.h @@ -30,7 +30,7 @@ template using __decay_t _LIBCPP_NODEBUG = __decay(_Tp); template -struct decay { +struct _LIBCPP_NO_SPECIALIZATIONS decay { using type _LIBCPP_NODEBUG = __decay_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/disjunction.h b/libcxx/include/__type_traits/disjunction.h index d579de9b98439d..8e7a38413a985c 100644 --- a/libcxx/include/__type_traits/disjunction.h +++ b/libcxx/include/__type_traits/disjunction.h @@ -46,10 +46,10 @@ using _Or _LIBCPP_NODEBUG = typename _OrImpl::template _R #if _LIBCPP_STD_VER >= 17 template -struct disjunction : _Or<_Args...> {}; +struct 
_LIBCPP_NO_SPECIALIZATIONS disjunction : _Or<_Args...> {}; template -inline constexpr bool disjunction_v = _Or<_Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool disjunction_v = _Or<_Args...>::value; #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__type_traits/enable_if.h b/libcxx/include/__type_traits/enable_if.h index 77da9622ca28fc..b572092542d4ab 100644 --- a/libcxx/include/__type_traits/enable_if.h +++ b/libcxx/include/__type_traits/enable_if.h @@ -18,11 +18,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS enable_if {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS enable_if{}; + +_LIBCPP_DIAGNOSTIC_PUSH +#if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +#endif template struct _LIBCPP_TEMPLATE_VIS enable_if { typedef _Tp type; }; +_LIBCPP_DIAGNOSTIC_POP template using __enable_if_t _LIBCPP_NODEBUG = typename enable_if<_Bp, _Tp>::type; diff --git a/libcxx/include/__type_traits/extent.h b/libcxx/include/__type_traits/extent.h index 1c34a4db1c4b52..6f3db916f96dce 100644 --- a/libcxx/include/__type_traits/extent.h +++ b/libcxx/include/__type_traits/extent.h @@ -22,11 +22,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__array_extent) template -struct _LIBCPP_TEMPLATE_VIS extent : integral_constant {}; +struct _LIBCPP_NO_SPECIALIZATIONS _LIBCPP_TEMPLATE_VIS extent : integral_constant {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr size_t extent_v = __array_extent(_Tp, _Ip); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr size_t extent_v = __array_extent(_Tp, _Ip); # endif #else // __has_builtin(__array_extent) diff --git a/libcxx/include/__type_traits/has_unique_object_representation.h b/libcxx/include/__type_traits/has_unique_object_representation.h index 98c440c16bf26b..d92fef0b5d2baa 100644 --- a/libcxx/include/__type_traits/has_unique_object_representation.h +++ b/libcxx/include/__type_traits/has_unique_object_representation.h @@ 
-22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 template -struct _LIBCPP_TEMPLATE_VIS has_unique_object_representations +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS has_unique_object_representations // TODO: We work around a Clang and GCC bug in __has_unique_object_representations by using remove_all_extents // even though it should not be necessary. This was reported to the compilers: // - Clang: https://github.com/llvm/llvm-project/issues/95311 @@ -31,7 +31,8 @@ struct _LIBCPP_TEMPLATE_VIS has_unique_object_representations : public integral_constant)> {}; template -inline constexpr bool has_unique_object_representations_v = __has_unique_object_representations(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool has_unique_object_representations_v = + __has_unique_object_representations(_Tp); #endif diff --git a/libcxx/include/__type_traits/has_virtual_destructor.h b/libcxx/include/__type_traits/has_virtual_destructor.h index 4ce96e649e67a1..98fa3cf6923987 100644 --- a/libcxx/include/__type_traits/has_virtual_destructor.h +++ b/libcxx/include/__type_traits/has_virtual_destructor.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS has_virtual_destructor : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS has_virtual_destructor + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool has_virtual_destructor_v = __has_virtual_destructor(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool has_virtual_destructor_v = __has_virtual_destructor(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/integral_constant.h b/libcxx/include/__type_traits/integral_constant.h index b8c75c546aa942..8feeff630d8741 100644 --- a/libcxx/include/__type_traits/integral_constant.h +++ b/libcxx/include/__type_traits/integral_constant.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS 
integral_constant { +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS integral_constant { static inline _LIBCPP_CONSTEXPR const _Tp value = __v; typedef _Tp value_type; typedef integral_constant type; diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h index 013293bec49b9f..ba4d539ee20b0e 100644 --- a/libcxx/include/__type_traits/invoke.h +++ b/libcxx/include/__type_traits/invoke.h @@ -278,34 +278,37 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Ret __invoke_r(_Args&&... _ // is_invocable template -struct _LIBCPP_TEMPLATE_VIS is_invocable : bool_constant<__is_invocable_v<_Fn, _Args...>> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_invocable : bool_constant<__is_invocable_v<_Fn, _Args...>> {}; template -struct _LIBCPP_TEMPLATE_VIS is_invocable_r : bool_constant<__is_invocable_r_v<_Ret, _Fn, _Args...>> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_invocable_r + : bool_constant<__is_invocable_r_v<_Ret, _Fn, _Args...>> {}; template -inline constexpr bool is_invocable_v = __is_invocable_v<_Fn, _Args...>; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_invocable_v = __is_invocable_v<_Fn, _Args...>; template -inline constexpr bool is_invocable_r_v = __is_invocable_r_v<_Ret, _Fn, _Args...>; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_invocable_r_v = __is_invocable_r_v<_Ret, _Fn, _Args...>; // is_nothrow_invocable template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable : bool_constant<__nothrow_invokable<_Fn, _Args...>::value> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_invocable + : bool_constant<__nothrow_invokable<_Fn, _Args...>::value> {}; template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable_r : bool_constant<__nothrow_invokable_r<_Ret, _Fn, _Args...>::value> { -}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_invocable_r + : bool_constant<__nothrow_invokable_r<_Ret, _Fn, _Args...>::value> {}; template -inline 
constexpr bool is_nothrow_invocable_v = is_nothrow_invocable<_Fn, _Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_invocable_v = is_nothrow_invocable<_Fn, _Args...>::value; template -inline constexpr bool is_nothrow_invocable_r_v = is_nothrow_invocable_r<_Ret, _Fn, _Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_invocable_r_v = + is_nothrow_invocable_r<_Ret, _Fn, _Args...>::value; template -struct _LIBCPP_TEMPLATE_VIS invoke_result : __invoke_result<_Fn, _Args...> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS invoke_result : __invoke_result<_Fn, _Args...> {}; template using invoke_result_t = typename invoke_result<_Fn, _Args...>::type; diff --git a/libcxx/include/__type_traits/is_abstract.h b/libcxx/include/__type_traits/is_abstract.h index 4aa456be1c48e8..20b9e56cd60ebd 100644 --- a/libcxx/include/__type_traits/is_abstract.h +++ b/libcxx/include/__type_traits/is_abstract.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_abstract : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_abstract + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_abstract_v = __is_abstract(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_abstract_v = __is_abstract(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_aggregate.h b/libcxx/include/__type_traits/is_aggregate.h index 4e0988071adeec..b5098ee1bcf1a2 100644 --- a/libcxx/include/__type_traits/is_aggregate.h +++ b/libcxx/include/__type_traits/is_aggregate.h @@ -21,10 +21,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 template -struct _LIBCPP_TEMPLATE_VIS is_aggregate : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_aggregate + : public integral_constant {}; template -inline constexpr bool is_aggregate_v = __is_aggregate(_Tp); 
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_aggregate_v = __is_aggregate(_Tp); #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__type_traits/is_arithmetic.h b/libcxx/include/__type_traits/is_arithmetic.h index c9713e1840a7b1..fcb31e9f5d5da4 100644 --- a/libcxx/include/__type_traits/is_arithmetic.h +++ b/libcxx/include/__type_traits/is_arithmetic.h @@ -21,12 +21,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_arithmetic +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_arithmetic : public integral_constant::value || is_floating_point<_Tp>::value> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_array.h b/libcxx/include/__type_traits/is_array.h index f34204e19ed899..0bde0aa970f884 100644 --- a/libcxx/include/__type_traits/is_array.h +++ b/libcxx/include/__type_traits/is_array.h @@ -23,11 +23,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD (!defined(_LIBCPP_COMPILER_CLANG_BASED) || (defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1900)) template -struct _LIBCPP_TEMPLATE_VIS is_array : _BoolConstant<__is_array(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_array : _BoolConstant<__is_array(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_array_v = __is_array(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_array_v = __is_array(_Tp); # endif #else diff --git a/libcxx/include/__type_traits/is_assignable.h b/libcxx/include/__type_traits/is_assignable.h index cfb46997778782..2a87bcc673e51e 100644 --- a/libcxx/include/__type_traits/is_assignable.h +++ b/libcxx/include/__type_traits/is_assignable.h @@ -21,30 +21,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_assignable : _BoolConstant<__is_assignable(_Tp, _Up)> {}; +struct 
_LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_assignable : _BoolConstant<__is_assignable(_Tp, _Up)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_assignable_v = __is_assignable(_Tp, _Arg); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_assignable_v = __is_assignable(_Tp, _Arg); #endif template -struct _LIBCPP_TEMPLATE_VIS is_copy_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_copy_assignable : public integral_constant, __add_lvalue_reference_t)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_move_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_move_assignable : public integral_constant, __add_rvalue_reference_t<_Tp>)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_base_of.h b/libcxx/include/__type_traits/is_base_of.h index 488b63719eb600..a48b8caded721a 100644 --- a/libcxx/include/__type_traits/is_base_of.h +++ b/libcxx/include/__type_traits/is_base_of.h @@ -19,21 +19,23 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_base_of : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_base_of + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_base_of_v = __is_base_of(_Bp, _Dp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_base_of_v = __is_base_of(_Bp, _Dp); #endif #if _LIBCPP_STD_VER >= 26 # if __has_builtin(__builtin_is_virtual_base_of) template -struct _LIBCPP_TEMPLATE_VIS is_virtual_base_of : public 
bool_constant<__builtin_is_virtual_base_of(_Base, _Derived)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_virtual_base_of + : public bool_constant<__builtin_is_virtual_base_of(_Base, _Derived)> {}; template -inline constexpr bool is_virtual_base_of_v = __builtin_is_virtual_base_of(_Base, _Derived); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_virtual_base_of_v = __builtin_is_virtual_base_of(_Base, _Derived); # endif #endif diff --git a/libcxx/include/__type_traits/is_bounded_array.h b/libcxx/include/__type_traits/is_bounded_array.h index fd794eb9a6935e..a853cc6d821eda 100644 --- a/libcxx/include/__type_traits/is_bounded_array.h +++ b/libcxx/include/__type_traits/is_bounded_array.h @@ -27,12 +27,18 @@ inline const bool __is_bounded_array_v<_Tp[_Np]> = true; #if _LIBCPP_STD_VER >= 20 template -struct _LIBCPP_TEMPLATE_VIS is_bounded_array : false_type {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_bounded_array : false_type {}; + +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template struct _LIBCPP_TEMPLATE_VIS is_bounded_array<_Tp[_Np]> : true_type {}; +_LIBCPP_DIAGNOSTIC_POP template -inline constexpr bool is_bounded_array_v = is_bounded_array<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_bounded_array_v = is_bounded_array<_Tp>::value; #endif diff --git a/libcxx/include/__type_traits/is_class.h b/libcxx/include/__type_traits/is_class.h index 034f76a7865e3d..5fce840058c056 100644 --- a/libcxx/include/__type_traits/is_class.h +++ b/libcxx/include/__type_traits/is_class.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_class : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_class : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_class_v = __is_class(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline 
constexpr bool is_class_v = __is_class(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_compound.h b/libcxx/include/__type_traits/is_compound.h index cd208ceab28863..6dc22f8a911164 100644 --- a/libcxx/include/__type_traits/is_compound.h +++ b/libcxx/include/__type_traits/is_compound.h @@ -22,11 +22,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_compound) template -struct _LIBCPP_TEMPLATE_VIS is_compound : _BoolConstant<__is_compound(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_compound : _BoolConstant<__is_compound(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_compound_v = __is_compound(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_compound_v = __is_compound(_Tp); # endif #else // __has_builtin(__is_compound) diff --git a/libcxx/include/__type_traits/is_const.h b/libcxx/include/__type_traits/is_const.h index 47ef70872b790a..78f98f3a96d4ff 100644 --- a/libcxx/include/__type_traits/is_const.h +++ b/libcxx/include/__type_traits/is_const.h @@ -21,11 +21,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_const) template -struct _LIBCPP_TEMPLATE_VIS is_const : _BoolConstant<__is_const(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_const : _BoolConstant<__is_const(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_const_v = __is_const(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_const_v = __is_const(_Tp); # endif #else diff --git a/libcxx/include/__type_traits/is_constructible.h b/libcxx/include/__type_traits/is_constructible.h index 567bd165c71520..6c7636e0bb5098 100644 --- a/libcxx/include/__type_traits/is_constructible.h +++ b/libcxx/include/__type_traits/is_constructible.h @@ -21,37 +21,39 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_constructible : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_constructible + : public 
integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); #endif template -struct _LIBCPP_TEMPLATE_VIS is_copy_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_copy_constructible : public integral_constant)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_move_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_move_constructible : public integral_constant)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_default_constructible : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_default_constructible + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_default_constructible_v = __is_constructible(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_default_constructible_v = __is_constructible(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_convertible.h b/libcxx/include/__type_traits/is_convertible.h index 414c2a6d6a0de0..61f6cf644124e1 100644 --- a/libcxx/include/__type_traits/is_convertible.h +++ b/libcxx/include/__type_traits/is_convertible.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_convertible : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_convertible + : public 
integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_convertible_v = __is_convertible(_From, _To); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_convertible_v = __is_convertible(_From, _To); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_destructible.h b/libcxx/include/__type_traits/is_destructible.h index 3248b07d36ee67..5fe923d303c531 100644 --- a/libcxx/include/__type_traits/is_destructible.h +++ b/libcxx/include/__type_traits/is_destructible.h @@ -25,11 +25,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_destructible) template -struct _LIBCPP_TEMPLATE_VIS is_destructible : _BoolConstant<__is_destructible(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_destructible : _BoolConstant<__is_destructible(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_destructible_v = __is_destructible(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_destructible_v = __is_destructible(_Tp); # endif #else // __has_builtin(__is_destructible) diff --git a/libcxx/include/__type_traits/is_empty.h b/libcxx/include/__type_traits/is_empty.h index 951d93b5a2f10e..8e66cd97a9f284 100644 --- a/libcxx/include/__type_traits/is_empty.h +++ b/libcxx/include/__type_traits/is_empty.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_empty : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_empty : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_empty_v = __is_empty(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_empty_v = __is_empty(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_enum.h b/libcxx/include/__type_traits/is_enum.h index 2fab6db2c8d50f..bc210ea289e62f 100644 --- a/libcxx/include/__type_traits/is_enum.h +++ b/libcxx/include/__type_traits/is_enum.h @@ -19,20 +19,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD 
template -struct _LIBCPP_TEMPLATE_VIS is_enum : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_enum : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_enum_v = __is_enum(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_enum_v = __is_enum(_Tp); #endif #if _LIBCPP_STD_VER >= 23 template -struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {}; template -inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp); #endif // _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__type_traits/is_execution_policy.h b/libcxx/include/__type_traits/is_execution_policy.h index a2d876db030927..84393e83a8efc1 100644 --- a/libcxx/include/__type_traits/is_execution_policy.h +++ b/libcxx/include/__type_traits/is_execution_policy.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline constexpr bool is_execution_policy_v = false; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_execution_policy_v = false; template inline constexpr bool __is_unsequenced_execution_policy_impl = false; diff --git a/libcxx/include/__type_traits/is_final.h b/libcxx/include/__type_traits/is_final.h index 499c5e3a1edca4..19d3ac3ecd35f9 100644 --- a/libcxx/include/__type_traits/is_final.h +++ b/libcxx/include/__type_traits/is_final.h @@ -23,12 +23,12 @@ struct _LIBCPP_TEMPLATE_VIS __libcpp_is_final : public integral_constant= 14 template -struct _LIBCPP_TEMPLATE_VIS is_final : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_final : public integral_constant {}; #endif #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_final_v = __is_final(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_final_v = __is_final(_Tp); 
#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_floating_point.h b/libcxx/include/__type_traits/is_floating_point.h index add34782dfa099..563ecce891f22d 100644 --- a/libcxx/include/__type_traits/is_floating_point.h +++ b/libcxx/include/__type_traits/is_floating_point.h @@ -27,11 +27,12 @@ template <> struct __libcpp_is_floating_point : public tru // clang-format on template -struct _LIBCPP_TEMPLATE_VIS is_floating_point : public __libcpp_is_floating_point<__remove_cv_t<_Tp> > {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_floating_point + : public __libcpp_is_floating_point<__remove_cv_t<_Tp> > {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_floating_point_v = is_floating_point<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_floating_point_v = is_floating_point<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_function.h b/libcxx/include/__type_traits/is_function.h index 98fedd0ad96d9b..63b842b26c4ce9 100644 --- a/libcxx/include/__type_traits/is_function.h +++ b/libcxx/include/__type_traits/is_function.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_function : integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_function : integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_function_v = __is_function(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_function_v = __is_function(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_fundamental.h b/libcxx/include/__type_traits/is_fundamental.h index 55f8e41f75f457..03e25474bea777 100644 --- a/libcxx/include/__type_traits/is_fundamental.h +++ b/libcxx/include/__type_traits/is_fundamental.h @@ -23,11 +23,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_fundamental) template -struct _LIBCPP_TEMPLATE_VIS is_fundamental : _BoolConstant<__is_fundamental(_Tp)> 
{}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_fundamental : _BoolConstant<__is_fundamental(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_fundamental_v = __is_fundamental(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_fundamental_v = __is_fundamental(_Tp); # endif #else // __has_builtin(__is_fundamental) diff --git a/libcxx/include/__type_traits/is_implicit_lifetime.h b/libcxx/include/__type_traits/is_implicit_lifetime.h index 2aba420bd2b59d..8b992095f105b9 100644 --- a/libcxx/include/__type_traits/is_implicit_lifetime.h +++ b/libcxx/include/__type_traits/is_implicit_lifetime.h @@ -22,10 +22,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD # if __has_builtin(__builtin_is_implicit_lifetime) template -struct _LIBCPP_TEMPLATE_VIS is_implicit_lifetime : public bool_constant<__builtin_is_implicit_lifetime(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_implicit_lifetime + : public bool_constant<__builtin_is_implicit_lifetime(_Tp)> {}; template -inline constexpr bool is_implicit_lifetime_v = __builtin_is_implicit_lifetime(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_implicit_lifetime_v = __builtin_is_implicit_lifetime(_Tp); # endif #endif diff --git a/libcxx/include/__type_traits/is_integral.h b/libcxx/include/__type_traits/is_integral.h index 763b6ac3d1077d..6ae9b31a8e9b63 100644 --- a/libcxx/include/__type_traits/is_integral.h +++ b/libcxx/include/__type_traits/is_integral.h @@ -50,11 +50,11 @@ template <> struct __libcpp_is_integral<__uint128_t> { enum { va #if __has_builtin(__is_integral) template -struct _LIBCPP_TEMPLATE_VIS is_integral : _BoolConstant<__is_integral(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_integral : _BoolConstant<__is_integral(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_integral_v = __is_integral(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_integral_v = __is_integral(_Tp); # endif #else diff --git 
a/libcxx/include/__type_traits/is_literal_type.h b/libcxx/include/__type_traits/is_literal_type.h index 10e23bceffbda3..e78343ee2f0be3 100644 --- a/libcxx/include/__type_traits/is_literal_type.h +++ b/libcxx/include/__type_traits/is_literal_type.h @@ -20,12 +20,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX17 is_literal_type : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS is_literal_type + : public integral_constant {}; # if _LIBCPP_STD_VER >= 17 template -_LIBCPP_DEPRECATED_IN_CXX17 inline constexpr bool is_literal_type_v = __is_literal_type(_Tp); +_LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_literal_type_v = __is_literal_type(_Tp); # endif // _LIBCPP_STD_VER >= 17 #endif // _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) diff --git a/libcxx/include/__type_traits/is_member_pointer.h b/libcxx/include/__type_traits/is_member_pointer.h index 3e2753ac4228c2..8a4f1fe0c736fa 100644 --- a/libcxx/include/__type_traits/is_member_pointer.h +++ b/libcxx/include/__type_traits/is_member_pointer.h @@ -19,23 +19,25 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_member_pointer : _BoolConstant<__is_member_pointer(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_member_pointer : _BoolConstant<__is_member_pointer(_Tp)> {}; template -struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : _BoolConstant<__is_member_object_pointer(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_member_object_pointer + : _BoolConstant<__is_member_object_pointer(_Tp)> {}; template -struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : _BoolConstant<__is_member_function_pointer(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_member_function_pointer + : 
_BoolConstant<__is_member_function_pointer(_Tp)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_member_pointer_v = __is_member_pointer(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_member_pointer_v = __is_member_pointer(_Tp); template -inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_Tp); template -inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_nothrow_assignable.h b/libcxx/include/__type_traits/is_nothrow_assignable.h index 7e00c741f83e30..4727deb6c3f854 100644 --- a/libcxx/include/__type_traits/is_nothrow_assignable.h +++ b/libcxx/include/__type_traits/is_nothrow_assignable.h @@ -21,34 +21,34 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_assignable : public integral_constant { -}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_assignable + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_assignable_v = __is_nothrow_assignable(_Tp, _Arg); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_assignable_v = __is_nothrow_assignable(_Tp, _Arg); #endif template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_copy_assignable : public integral_constant< bool, __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_copy_assignable_v = is_nothrow_copy_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_copy_assignable_v = 
is_nothrow_copy_assignable<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_move_assignable : public integral_constant, __add_rvalue_reference_t<_Tp>)> { }; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h index 58d2b2475140b6..1e4eebd006ec1b 100644 --- a/libcxx/include/__type_traits/is_nothrow_constructible.h +++ b/libcxx/include/__type_traits/is_nothrow_constructible.h @@ -21,39 +21,42 @@ _LIBCPP_BEGIN_NAMESPACE_STD template < class _Tp, class... _Args> -struct _LIBCPP_TEMPLATE_VIS is_nothrow_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_constructible : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_constructible_v = is_nothrow_constructible<_Tp, _Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_constructible_v = + is_nothrow_constructible<_Tp, _Args...>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_copy_constructible : public integral_constant< bool, __is_nothrow_constructible(_Tp, __add_lvalue_reference_t)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_copy_constructible_v = is_nothrow_copy_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_copy_constructible_v = + is_nothrow_copy_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible +struct _LIBCPP_TEMPLATE_VIS 
_LIBCPP_NO_SPECIALIZATIONS is_nothrow_move_constructible : public integral_constant)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_move_constructible_v = is_nothrow_move_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_move_constructible_v = + is_nothrow_move_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_default_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_default_constructible : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_nothrow_convertible.h b/libcxx/include/__type_traits/is_nothrow_convertible.h index bfc5a94cbadec6..8b1aacf8f28768 100644 --- a/libcxx/include/__type_traits/is_nothrow_convertible.h +++ b/libcxx/include/__type_traits/is_nothrow_convertible.h @@ -29,10 +29,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD # if __has_builtin(__is_nothrow_convertible) template -struct is_nothrow_convertible : bool_constant<__is_nothrow_convertible(_Tp, _Up)> {}; +struct _LIBCPP_NO_SPECIALIZATIONS is_nothrow_convertible : bool_constant<__is_nothrow_convertible(_Tp, _Up)> {}; template -inline constexpr bool is_nothrow_convertible_v = __is_nothrow_convertible(_Tp, _Up); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_convertible_v = __is_nothrow_convertible(_Tp, _Up); # else // __has_builtin(__is_nothrow_convertible) diff --git a/libcxx/include/__type_traits/is_nothrow_destructible.h b/libcxx/include/__type_traits/is_nothrow_destructible.h index 41271a38f37116..a363ad6b4af3bc 100644 --- a/libcxx/include/__type_traits/is_nothrow_destructible.h +++ b/libcxx/include/__type_traits/is_nothrow_destructible.h @@ -24,7 +24,8 @@ 
_LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_nothrow_destructible) template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible : integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_destructible + : integral_constant {}; #else @@ -55,7 +56,7 @@ struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible<_Tp&&> : public true_type {} #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_nothrow_destructible_v = is_nothrow_destructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_destructible_v = is_nothrow_destructible<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_null_pointer.h b/libcxx/include/__type_traits/is_null_pointer.h index abc5d142562f1e..fc6c6a69de338a 100644 --- a/libcxx/include/__type_traits/is_null_pointer.h +++ b/libcxx/include/__type_traits/is_null_pointer.h @@ -24,11 +24,12 @@ inline const bool __is_null_pointer_v = __is_same(__remove_cv(_Tp), nullptr_t); #if _LIBCPP_STD_VER >= 14 template -struct _LIBCPP_TEMPLATE_VIS is_null_pointer : integral_constant> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_null_pointer + : integral_constant> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_null_pointer_v = __is_null_pointer_v<_Tp>; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_null_pointer_v = __is_null_pointer_v<_Tp>; # endif #endif // _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/__type_traits/is_object.h b/libcxx/include/__type_traits/is_object.h index ec04508402ce51..eba4ab5cb8806e 100644 --- a/libcxx/include/__type_traits/is_object.h +++ b/libcxx/include/__type_traits/is_object.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_object : _BoolConstant<__is_object(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_object : _BoolConstant<__is_object(_Tp)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_object_v = 
__is_object(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_object_v = __is_object(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_pod.h b/libcxx/include/__type_traits/is_pod.h index 5888fbf457d8b1..a57662400394a8 100644 --- a/libcxx/include/__type_traits/is_pod.h +++ b/libcxx/include/__type_traits/is_pod.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_pod : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_pod : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_pod_v = __is_pod(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_pod_v = __is_pod(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_pointer.h b/libcxx/include/__type_traits/is_pointer.h index 9701e57807cf6f..5647bf4045ff3d 100644 --- a/libcxx/include/__type_traits/is_pointer.h +++ b/libcxx/include/__type_traits/is_pointer.h @@ -22,11 +22,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_pointer) template -struct _LIBCPP_TEMPLATE_VIS is_pointer : _BoolConstant<__is_pointer(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_pointer : _BoolConstant<__is_pointer(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_pointer_v = __is_pointer(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_pointer_v = __is_pointer(_Tp); # endif #else // __has_builtin(__is_pointer) diff --git a/libcxx/include/__type_traits/is_polymorphic.h b/libcxx/include/__type_traits/is_polymorphic.h index d122e1c87775bd..17e9c21c0d6a17 100644 --- a/libcxx/include/__type_traits/is_polymorphic.h +++ b/libcxx/include/__type_traits/is_polymorphic.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_polymorphic : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_polymorphic + : public integral_constant {}; #if 
_LIBCPP_STD_VER >= 17 template -inline constexpr bool is_polymorphic_v = __is_polymorphic(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_polymorphic_v = __is_polymorphic(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_reference.h b/libcxx/include/__type_traits/is_reference.h index cc157a438e4913..564e888b77c137 100644 --- a/libcxx/include/__type_traits/is_reference.h +++ b/libcxx/include/__type_traits/is_reference.h @@ -19,26 +19,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_reference : _BoolConstant<__is_reference(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_reference : _BoolConstant<__is_reference(_Tp)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_reference_v = __is_reference(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_reference_v = __is_reference(_Tp); #endif #if __has_builtin(__is_lvalue_reference) && __has_builtin(__is_rvalue_reference) template -struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference : _BoolConstant<__is_lvalue_reference(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_lvalue_reference : _BoolConstant<__is_lvalue_reference(_Tp)> { +}; template -struct _LIBCPP_TEMPLATE_VIS is_rvalue_reference : _BoolConstant<__is_rvalue_reference(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_rvalue_reference : _BoolConstant<__is_rvalue_reference(_Tp)> { +}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_lvalue_reference_v = __is_lvalue_reference(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_lvalue_reference_v = __is_lvalue_reference(_Tp); template -inline constexpr bool is_rvalue_reference_v = __is_rvalue_reference(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_rvalue_reference_v = __is_rvalue_reference(_Tp); # endif #else // __has_builtin(__is_lvalue_reference) diff --git a/libcxx/include/__type_traits/is_same.h b/libcxx/include/__type_traits/is_same.h 
index 400f870904d2d0..befab8999ae261 100644 --- a/libcxx/include/__type_traits/is_same.h +++ b/libcxx/include/__type_traits/is_same.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_same : _BoolConstant<__is_same(_Tp, _Up)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_same : _BoolConstant<__is_same(_Tp, _Up)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_same_v = __is_same(_Tp, _Up); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_same_v = __is_same(_Tp, _Up); #endif // _IsSame has the same effect as is_same but instantiates fewer types: diff --git a/libcxx/include/__type_traits/is_scalar.h b/libcxx/include/__type_traits/is_scalar.h index 242023a6877c94..6ef57e1dd22d56 100644 --- a/libcxx/include/__type_traits/is_scalar.h +++ b/libcxx/include/__type_traits/is_scalar.h @@ -26,11 +26,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_scalar) template -struct _LIBCPP_TEMPLATE_VIS is_scalar : _BoolConstant<__is_scalar(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_scalar : _BoolConstant<__is_scalar(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_scalar_v = __is_scalar(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_scalar_v = __is_scalar(_Tp); # endif #else // __has_builtin(__is_scalar) diff --git a/libcxx/include/__type_traits/is_signed.h b/libcxx/include/__type_traits/is_signed.h index fd6f93e1823627..535324fdbfc142 100644 --- a/libcxx/include/__type_traits/is_signed.h +++ b/libcxx/include/__type_traits/is_signed.h @@ -23,11 +23,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_signed) template -struct _LIBCPP_TEMPLATE_VIS is_signed : _BoolConstant<__is_signed(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_signed : _BoolConstant<__is_signed(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_signed_v = __is_signed(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool 
is_signed_v = __is_signed(_Tp); # endif #else // __has_builtin(__is_signed) diff --git a/libcxx/include/__type_traits/is_standard_layout.h b/libcxx/include/__type_traits/is_standard_layout.h index 76484f3e2a301f..e70d0f365416e4 100644 --- a/libcxx/include/__type_traits/is_standard_layout.h +++ b/libcxx/include/__type_traits/is_standard_layout.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_standard_layout : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_standard_layout + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_standard_layout_v = __is_standard_layout(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_standard_layout_v = __is_standard_layout(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h index aa5eecd9abe0f1..c7527814cffb97 100644 --- a/libcxx/include/__type_traits/is_swappable.h +++ b/libcxx/include/__type_traits/is_swappable.h @@ -74,30 +74,33 @@ inline const bool __is_nothrow_swappable_with_v<_Tp, _Up, true> = #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_swappable_with_v = __is_swappable_with_v<_Tp, _Up>; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_swappable_with_v = __is_swappable_with_v<_Tp, _Up>; template -struct _LIBCPP_TEMPLATE_VIS is_swappable_with : bool_constant> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_swappable_with + : bool_constant> {}; template -inline constexpr bool is_swappable_v = +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_swappable_v = is_swappable_with_v<__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<_Tp>>; template -struct _LIBCPP_TEMPLATE_VIS is_swappable : bool_constant> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_swappable : bool_constant> {}; template -inline constexpr bool is_nothrow_swappable_with_v = 
__is_nothrow_swappable_with_v<_Tp, _Up>; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_swappable_with_v = __is_nothrow_swappable_with_v<_Tp, _Up>; template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_swappable_with : bool_constant> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_swappable_with + : bool_constant> {}; template -inline constexpr bool is_nothrow_swappable_v = +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_nothrow_swappable_v = is_nothrow_swappable_with_v<__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<_Tp>>; template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_swappable : bool_constant> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_nothrow_swappable + : bool_constant> {}; #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__type_traits/is_trivial.h b/libcxx/include/__type_traits/is_trivial.h index 0007c7446d5e5f..a4ca2d6b26d0e6 100644 --- a/libcxx/include/__type_traits/is_trivial.h +++ b/libcxx/include/__type_traits/is_trivial.h @@ -19,11 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_trivial : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivial : public integral_constant { +}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivial_v = __is_trivial(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivial_v = __is_trivial(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_trivially_assignable.h b/libcxx/include/__type_traits/is_trivially_assignable.h index 7720c3e637506a..d91b6d89c7e21b 100644 --- a/libcxx/include/__type_traits/is_trivially_assignable.h +++ b/libcxx/include/__type_traits/is_trivially_assignable.h @@ -21,33 +21,36 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct is_trivially_assignable : integral_constant {}; +struct _LIBCPP_NO_SPECIALIZATIONS is_trivially_assignable + : integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline 
constexpr bool is_trivially_assignable_v = __is_trivially_assignable(_Tp, _Arg); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_assignable_v = __is_trivially_assignable(_Tp, _Arg); #endif template -struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_copy_assignable : public integral_constant< bool, __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_copy_assignable_v = is_trivially_copy_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_copy_assignable_v = + is_trivially_copy_assignable<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_trivially_move_assignable +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_move_assignable : public integral_constant< bool, __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_move_assignable_v = is_trivially_move_assignable<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_move_assignable_v = + is_trivially_move_assignable<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_trivially_constructible.h b/libcxx/include/__type_traits/is_trivially_constructible.h index 3a77e9fe164da1..4a212d462b63f2 100644 --- a/libcxx/include/__type_traits/is_trivially_constructible.h +++ b/libcxx/include/__type_traits/is_trivially_constructible.h @@ -21,39 +21,43 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_trivially_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_constructible : integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_constructible_v = __is_trivially_constructible(_Tp, _Args...); 
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_constructible_v = + __is_trivially_constructible(_Tp, _Args...); #endif template -struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_copy_constructible : public integral_constant)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_copy_constructible_v = is_trivially_copy_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_copy_constructible_v = + is_trivially_copy_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_trivially_move_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_move_constructible : public integral_constant)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_move_constructible_v = is_trivially_move_constructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_move_constructible_v = + is_trivially_move_constructible<_Tp>::value; #endif template -struct _LIBCPP_TEMPLATE_VIS is_trivially_default_constructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_default_constructible : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_default_constructible_v = __is_trivially_constructible(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_default_constructible_v = + __is_trivially_constructible(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_trivially_copyable.h b/libcxx/include/__type_traits/is_trivially_copyable.h index 8eb3ba7581af15..72f1d6beae5d43 100644 --- a/libcxx/include/__type_traits/is_trivially_copyable.h +++ b/libcxx/include/__type_traits/is_trivially_copyable.h @@ -20,11 +20,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_trivially_copyable : public integral_constant {}; +struct 
_LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_copyable + : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_copyable_v = __is_trivially_copyable(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_copyable_v = __is_trivially_copyable(_Tp); #endif template diff --git a/libcxx/include/__type_traits/is_trivially_destructible.h b/libcxx/include/__type_traits/is_trivially_destructible.h index 5f9652f2a5011c..3bca575528e66a 100644 --- a/libcxx/include/__type_traits/is_trivially_destructible.h +++ b/libcxx/include/__type_traits/is_trivially_destructible.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_trivially_destructible) template -struct _LIBCPP_TEMPLATE_VIS is_trivially_destructible +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_trivially_destructible : public integral_constant {}; #elif __has_builtin(__has_trivial_destructor) @@ -39,7 +39,7 @@ struct _LIBCPP_TEMPLATE_VIS is_trivially_destructible #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_trivially_destructible_v = is_trivially_destructible<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_trivially_destructible_v = is_trivially_destructible<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_unbounded_array.h b/libcxx/include/__type_traits/is_unbounded_array.h index b0879476bd23e7..65a6e1a5276ab3 100644 --- a/libcxx/include/__type_traits/is_unbounded_array.h +++ b/libcxx/include/__type_traits/is_unbounded_array.h @@ -26,12 +26,18 @@ inline const bool __is_unbounded_array_v<_Tp[]> = true; #if _LIBCPP_STD_VER >= 20 template -struct _LIBCPP_TEMPLATE_VIS is_unbounded_array : false_type {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_unbounded_array : false_type {}; + +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template 
struct _LIBCPP_TEMPLATE_VIS is_unbounded_array<_Tp[]> : true_type {}; +_LIBCPP_DIAGNOSTIC_POP template -inline constexpr bool is_unbounded_array_v = is_unbounded_array<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_unbounded_array_v = is_unbounded_array<_Tp>::value; #endif diff --git a/libcxx/include/__type_traits/is_union.h b/libcxx/include/__type_traits/is_union.h index 1f009d993545ba..4be5ae01183976 100644 --- a/libcxx/include/__type_traits/is_union.h +++ b/libcxx/include/__type_traits/is_union.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS is_union : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_union : public integral_constant {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_union_v = __is_union(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_union_v = __is_union(_Tp); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_unsigned.h b/libcxx/include/__type_traits/is_unsigned.h index 48c5751ed70d8e..be855ee1d7fde8 100644 --- a/libcxx/include/__type_traits/is_unsigned.h +++ b/libcxx/include/__type_traits/is_unsigned.h @@ -23,11 +23,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_unsigned) template -struct _LIBCPP_TEMPLATE_VIS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_unsigned_v = __is_unsigned(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_unsigned_v = __is_unsigned(_Tp); # endif #else // __has_builtin(__is_unsigned) diff --git a/libcxx/include/__type_traits/is_void.h b/libcxx/include/__type_traits/is_void.h index 562faae9fba2cd..48d73ce49ac5c4 100644 --- a/libcxx/include/__type_traits/is_void.h +++ b/libcxx/include/__type_traits/is_void.h @@ -19,11 +19,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct 
_LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_same(__remove_cv(_Tp), void)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_void : _BoolConstant<__is_same(__remove_cv(_Tp), void)> {}; #if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_void_v = __is_same(__remove_cv(_Tp), void); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_void_v = __is_same(__remove_cv(_Tp), void); #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_volatile.h b/libcxx/include/__type_traits/is_volatile.h index 87960a819c8fcb..033d1e3f3b8656 100644 --- a/libcxx/include/__type_traits/is_volatile.h +++ b/libcxx/include/__type_traits/is_volatile.h @@ -21,11 +21,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__is_volatile) template -struct _LIBCPP_TEMPLATE_VIS is_volatile : _BoolConstant<__is_volatile(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS is_volatile : _BoolConstant<__is_volatile(_Tp)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_volatile_v = __is_volatile(_Tp); +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_volatile_v = __is_volatile(_Tp); # endif #else diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h index 42614a420f9fbf..dff23d880dc30a 100644 --- a/libcxx/include/__type_traits/make_signed.h +++ b/libcxx/include/__type_traits/make_signed.h @@ -70,7 +70,7 @@ using __make_signed_t = __copy_cv_t<_Tp, typename __make_signed<__remove_cv_t<_T #endif // __has_builtin(__make_signed) template -struct make_signed { +struct _LIBCPP_NO_SPECIALIZATIONS make_signed { using type _LIBCPP_NODEBUG = __make_signed_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h index 50928b03b0eb64..a83baa658e294c 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -72,7 +72,7 @@ using __make_unsigned_t = __copy_cv_t<_Tp, typename 
__make_unsigned<__remove_cv_ #endif // __has_builtin(__make_unsigned) template -struct make_unsigned { +struct _LIBCPP_NO_SPECIALIZATIONS make_unsigned { using type _LIBCPP_NODEBUG = __make_unsigned_t<_Tp>; }; diff --git a/libcxx/include/__type_traits/negation.h b/libcxx/include/__type_traits/negation.h index a72e62d3f96e0c..a745a999a8bfbd 100644 --- a/libcxx/include/__type_traits/negation.h +++ b/libcxx/include/__type_traits/negation.h @@ -23,9 +23,9 @@ struct _Not : _BoolConstant {}; #if _LIBCPP_STD_VER >= 17 template -struct negation : _Not<_Tp> {}; +struct _LIBCPP_NO_SPECIALIZATIONS negation : _Not<_Tp> {}; template -inline constexpr bool negation_v = !_Tp::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool negation_v = !_Tp::value; #endif // _LIBCPP_STD_VER >= 17 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/rank.h b/libcxx/include/__type_traits/rank.h index aeeedec40dee56..1745660ce5ef7e 100644 --- a/libcxx/include/__type_traits/rank.h +++ b/libcxx/include/__type_traits/rank.h @@ -28,17 +28,23 @@ struct rank : integral_constant {}; #else template -struct _LIBCPP_TEMPLATE_VIS rank : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS _LIBCPP_NO_SPECIALIZATIONS rank : public integral_constant {}; + +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template struct _LIBCPP_TEMPLATE_VIS rank<_Tp[]> : public integral_constant::value + 1> {}; template struct _LIBCPP_TEMPLATE_VIS rank<_Tp[_Np]> : public integral_constant::value + 1> {}; +_LIBCPP_DIAGNOSTIC_POP #endif // __has_builtin(__array_rank) #if _LIBCPP_STD_VER >= 17 template -inline constexpr size_t rank_v = rank<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr size_t rank_v = rank<_Tp>::value; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/remove_all_extents.h b/libcxx/include/__type_traits/remove_all_extents.h index d46a3228b4ab5d..cb40ea4e4e1e09 
100644 --- a/libcxx/include/__type_traits/remove_all_extents.h +++ b/libcxx/include/__type_traits/remove_all_extents.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__remove_all_extents) template -struct remove_all_extents { +struct _LIBCPP_NO_SPECIALIZATIONS remove_all_extents { using type _LIBCPP_NODEBUG = __remove_all_extents(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_const.h b/libcxx/include/__type_traits/remove_const.h index 6250d9f531170b..37315c5229ee84 100644 --- a/libcxx/include/__type_traits/remove_const.h +++ b/libcxx/include/__type_traits/remove_const.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__remove_const) template -struct remove_const { +struct _LIBCPP_NO_SPECIALIZATIONS remove_const { using type _LIBCPP_NODEBUG = __remove_const(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h index 16848e6d71128a..0beaf3367ecba1 100644 --- a/libcxx/include/__type_traits/remove_cv.h +++ b/libcxx/include/__type_traits/remove_cv.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct remove_cv { +struct _LIBCPP_NO_SPECIALIZATIONS remove_cv { using type _LIBCPP_NODEBUG = __remove_cv(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_cvref.h b/libcxx/include/__type_traits/remove_cvref.h index 25ee853aaf2fc0..760deddca946d1 100644 --- a/libcxx/include/__type_traits/remove_cvref.h +++ b/libcxx/include/__type_traits/remove_cvref.h @@ -36,7 +36,7 @@ using __is_same_uncvref _LIBCPP_NODEBUG = _IsSame<__remove_cvref_t<_Tp>, __remov #if _LIBCPP_STD_VER >= 20 template -struct remove_cvref { +struct _LIBCPP_NO_SPECIALIZATIONS remove_cvref { using type _LIBCPP_NODEBUG = __remove_cvref(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_extent.h b/libcxx/include/__type_traits/remove_extent.h index 95a7971d7a9c22..636e392c9b1073 100644 --- a/libcxx/include/__type_traits/remove_extent.h +++ b/libcxx/include/__type_traits/remove_extent.h @@ -20,7 +20,7 
@@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__remove_extent) template -struct remove_extent { +struct _LIBCPP_NO_SPECIALIZATIONS remove_extent { using type _LIBCPP_NODEBUG = __remove_extent(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_pointer.h b/libcxx/include/__type_traits/remove_pointer.h index 47cd1cd1d80fae..2f7ac151e9376c 100644 --- a/libcxx/include/__type_traits/remove_pointer.h +++ b/libcxx/include/__type_traits/remove_pointer.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if !defined(_LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS) && __has_builtin(__remove_pointer) template -struct remove_pointer { +struct _LIBCPP_NO_SPECIALIZATIONS remove_pointer { using type _LIBCPP_NODEBUG = __remove_pointer(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_reference.h b/libcxx/include/__type_traits/remove_reference.h index f68815691ac0f9..7cc3ca1705de2d 100644 --- a/libcxx/include/__type_traits/remove_reference.h +++ b/libcxx/include/__type_traits/remove_reference.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__remove_reference_t) template -struct remove_reference { +struct _LIBCPP_NO_SPECIALIZATIONS remove_reference { using type _LIBCPP_NODEBUG = __remove_reference_t(_Tp); }; diff --git a/libcxx/include/__type_traits/remove_volatile.h b/libcxx/include/__type_traits/remove_volatile.h index 099945df012418..5d73f7e5d241cd 100644 --- a/libcxx/include/__type_traits/remove_volatile.h +++ b/libcxx/include/__type_traits/remove_volatile.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if __has_builtin(__remove_volatile) template -struct remove_volatile { +struct _LIBCPP_NO_SPECIALIZATIONS remove_volatile { using type _LIBCPP_NODEBUG = __remove_volatile(_Tp); }; diff --git a/libcxx/include/__type_traits/type_identity.h b/libcxx/include/__type_traits/type_identity.h index b0b5a1277d5962..f526aef4d5b2ff 100644 --- a/libcxx/include/__type_traits/type_identity.h +++ b/libcxx/include/__type_traits/type_identity.h @@ -27,7 +27,7 @@ using 
__type_identity_t _LIBCPP_NODEBUG = typename __type_identity<_Tp>::type; #if _LIBCPP_STD_VER >= 20 template -struct type_identity { +struct _LIBCPP_NO_SPECIALIZATIONS type_identity { typedef _Tp type; }; template diff --git a/libcxx/include/__type_traits/underlying_type.h b/libcxx/include/__type_traits/underlying_type.h index 16e7501dee17df..45a9b40e3e4c9b 100644 --- a/libcxx/include/__type_traits/underlying_type.h +++ b/libcxx/include/__type_traits/underlying_type.h @@ -30,7 +30,7 @@ struct __underlying_type_impl<_Tp, true> { }; template -struct underlying_type : __underlying_type_impl<_Tp, is_enum<_Tp>::value> {}; +struct _LIBCPP_NO_SPECIALIZATIONS underlying_type : __underlying_type_impl<_Tp, is_enum<_Tp>::value> {}; #if _LIBCPP_STD_VER >= 14 template diff --git a/libcxx/include/__type_traits/unwrap_ref.h b/libcxx/include/__type_traits/unwrap_ref.h index 11a069d6630239..e8ca2b5028f6c7 100644 --- a/libcxx/include/__type_traits/unwrap_ref.h +++ b/libcxx/include/__type_traits/unwrap_ref.h @@ -34,13 +34,13 @@ using __unwrap_ref_decay_t _LIBCPP_NODEBUG = typename __unwrap_reference<__decay #if _LIBCPP_STD_VER >= 20 template -struct unwrap_reference : __unwrap_reference<_Tp> {}; +struct _LIBCPP_NO_SPECIALIZATIONS unwrap_reference : __unwrap_reference<_Tp> {}; template using unwrap_reference_t = typename unwrap_reference<_Tp>::type; template -struct unwrap_ref_decay : unwrap_reference<__decay_t<_Tp> > {}; +struct _LIBCPP_NO_SPECIALIZATIONS unwrap_ref_decay : unwrap_reference<__decay_t<_Tp> > {}; template using unwrap_ref_decay_t = __unwrap_ref_decay_t<_Tp>; diff --git a/libcxx/include/execution b/libcxx/include/execution index 7c695997cd0479..5b1915a998732b 100644 --- a/libcxx/include/execution +++ b/libcxx/include/execution @@ -96,6 +96,10 @@ inline constexpr unsequenced_policy unseq{__disable_user_instantiations_tag{}}; } // namespace execution +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") 
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template <> inline constexpr bool is_execution_policy_v = true; @@ -107,6 +111,7 @@ inline constexpr bool is_execution_policy_v inline constexpr bool is_execution_policy_v = true; +_LIBCPP_DIAGNOSTIC_POP template <> inline constexpr bool __is_parallel_execution_policy_impl = true; @@ -121,8 +126,13 @@ template <> inline constexpr bool __is_unsequenced_execution_policy_impl = true; # if _LIBCPP_STD_VER >= 20 +_LIBCPP_DIAGNOSTIC_PUSH +# if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +# endif template <> inline constexpr bool is_execution_policy_v = true; +_LIBCPP_DIAGNOSTIC_POP template <> inline constexpr bool __is_unsequenced_execution_policy_impl = true; @@ -130,7 +140,7 @@ inline constexpr bool __is_unsequenced_execution_policy_impl -struct is_execution_policy : bool_constant> {}; +struct _LIBCPP_NO_SPECIALIZATIONS is_execution_policy : bool_constant> {}; template _LIBCPP_HIDE_FROM_ABI auto __remove_parallel_policy(const _ExecutionPolicy&) { diff --git a/libcxx/include/variant b/libcxx/include/variant index 6c7be7f8f1eb5d..3786d9524020b8 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1156,7 +1156,7 @@ visit(_Visitor&& __visitor, _Vs&&... 
__vs); # endif template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DECLSPEC_EMPTY_BASES variant +class _LIBCPP_TEMPLATE_VIS _LIBCPP_DECLSPEC_EMPTY_BASES _LIBCPP_NO_SPECIALIZATIONS variant : private __sfinae_ctor_base< __all...>::value, __all...>::value>, private __sfinae_assign_base< diff --git a/libcxx/test/libcxx/algorithms/no_specializations.verify.cpp b/libcxx/test/libcxx/algorithms/no_specializations.verify.cpp new file mode 100644 index 00000000000000..5b2475252b6023 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/no_specializations.verify.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// UNSUPPORTED: libcpp-has-no-incomplete-pstl + +// Check that user-specializations are diagnosed +// See [execpol.type]/3 + +#include + +#if !__has_warning("-Winvalid-specializations") +// expected-no-diagnostics +#else +struct S {}; + +template <> +struct std::is_execution_policy; // expected-error {{cannot be specialized}} + +template <> +constexpr bool std::is_execution_policy_v = false; // expected-error {{cannot be specialized}} +#endif diff --git a/libcxx/test/libcxx/language.support/no_specializations.verify.cpp b/libcxx/test/libcxx/language.support/no_specializations.verify.cpp new file mode 100644 index 00000000000000..b7b75b7e9a00d1 --- /dev/null +++ b/libcxx/test/libcxx/language.support/no_specializations.verify.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// Check that user-specializations are diagnosed +// See [cmp.result]/1 + +#include + +#if !__has_warning("-Winvalid-specialization") +// expected-no-diagnostics +#else +struct S {}; + +template <> +struct std::compare_three_way_result; // expected-error {{cannot be specialized}} +#endif diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp new file mode 100644 index 00000000000000..69d458a9205583 --- /dev/null +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// Check that user-specializations are diagnosed +// See [range.adaptor.object]/5 + +#include + +#if !__has_warning("-Winvalid-specialization") +// expected-no-diagnostics +#else +struct S {}; + +template <> +class std::ranges::range_adaptor_closure; // expected-error {{cannot be specialized}} +#endif diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp new file mode 100644 index 00000000000000..e6d960667e8c0c --- /dev/null +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -0,0 +1,176 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Check that user-specializations are diagnosed +// See [meta.rqmts]/4, [meta.trans.other]/5, [meta.trans.other]/7 + +#include + +#include "test_macros.h" + +#if !__has_warning("-Winvalid-specialization") +// expected-no-diagnostics +#else +struct S {}; + +# define SPECIALIZE_TRAIT(Trait) \ + template <> \ + struct std::Trait + +SPECIALIZE_TRAIT(add_const); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(add_cv); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(add_volatile); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(add_lvalue_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(add_rvalue_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(add_pointer); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(decay); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(invoke_result); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} 
+SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} + +# undef SPECIALIZE_TRAIT +# define SPECIALIZE_UTT(Trait) \ + template <> \ + struct std::Trait; \ + template <> \ + inline constexpr bool std::Trait##_v = false + +# define SPECIALIZE_BTT(Trait) \ + template <> \ + struct std::Trait; \ + template <> \ + inline constexpr bool std::Trait##_v = false + +SPECIALIZE_UTT(alignment_of); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(conjunction); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(disjunction); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(extent); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(has_unique_object_representations); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_abstract); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_aggregate); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_arithmetic); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_array); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_base_of); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_class); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_compound); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_const); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_convertible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_copy_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_copy_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_default_constructible); // 
expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_destructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_empty); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_enum); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_final); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_floating_point); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_function); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_fundamental); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_integral); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_invocable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_invocable_r); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_lvalue_reference); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_member_pointer); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_member_object_pointer); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_member_function_pointer); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_destructible); // expected-error 2 {{cannot be specialized}} 
+SPECIALIZE_UTT(is_nothrow_move_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_move_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_invocable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_invocable_r); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_nothrow_swappable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_swappable_with); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_null_pointer); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_object); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_pod); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_pointer); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_polymorphic); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_reference); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_rvalue_reference); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_same); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scalar); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_signed); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_standard_layout); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_swappable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_swappable_with); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivial); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_trivially_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_copy_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_copy_constructible); // expected-error 2 {{cannot be specialized}} 
+SPECIALIZE_UTT(is_trivially_copyable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_volatile); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(negation); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(rank); // expected-error 2 {{cannot be specialized}} + +# if TEST_STD_VER <= 17 +SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}} +# endif + +# if TEST_STD_VER >= 20 +SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}} +# endif + +# if TEST_STD_VER >= 23 +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} +# endif + +# if TEST_STD_VER >= 26 +SPECIALIZE_BTT(is_virtual_base_of); // expected-error 2 {{cannot be specialized}} +# endif + +# undef SPECIALIZE_UTT +# undef SPECIALIZE_BTT + +template <> +struct std::aligned_storage<1, 3>; // expected-error {{cannot be specialized}} + +template <> +struct std::aligned_union<1, S>; // expected-error {{cannot be specialized}} + +template <> +struct std::conditional; // expected-error {{cannot be specialized}} + +template <> +struct std::enable_if; // expected-error 
{{cannot be specialized}} + +template <> +struct std::integral_constant; // expected-error {{cannot be specialized}} +#endif diff --git a/libcxx/test/libcxx/utilities/format/no_specializations.verify.cpp b/libcxx/test/libcxx/utilities/format/no_specializations.verify.cpp new file mode 100644 index 00000000000000..e1acbaf16f5b4d --- /dev/null +++ b/libcxx/test/libcxx/utilities/format/no_specializations.verify.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// Check that user-specializations are diagnosed +// See [format.arg]/2 + +#include + +#if !__has_warning("-Winvalid-specialization") +// expected-no-diagnostics +#else +struct S {}; + +template <> +class std::basic_format_arg; // expected-error {{cannot be specialized}} +#endif diff --git a/libcxx/test/libcxx/utilities/no_specializations.verify.cpp b/libcxx/test/libcxx/utilities/no_specializations.verify.cpp new file mode 100644 index 00000000000000..d4743f4fd3f908 --- /dev/null +++ b/libcxx/test/libcxx/utilities/no_specializations.verify.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Check that user-specializations are diagnosed +// See [variant.variant.general]/4 + +#include + +#if !__has_warning("-Winvalid-specialization") +// expected-no-diagnostics +#else +struct S {}; + +template <> +class std::variant; // expected-error {{cannot be specialized}} +#endif From 9705500582b9c2b2e1dd6de14f03a94d270a9250 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 23 Jan 2025 12:24:16 +0000 Subject: [PATCH 131/208] [libclc] Move nextafter to the CLC library (#124097) There were two implementations of this - one that implemented nextafter in software, and another that called a clang builtin. No in-tree targets called the builtin, so all targets build the software version. The builtin version has been removed, and the software version has been renamed to be the "default". This commit also optimizes nextafter, to avoid scalarization as much as possible. Note however that the (CLC) relational builtins still scalarize; those will be optimized in a separate commit. Since nextafter is used by some convert_type builtins, the diff to IR codegen is not limited to the builtin itself. 
--- libclc/amdgpu/lib/SOURCES | 1 - libclc/amdgpu/lib/math/nextafter.cl | 15 ----- libclc/clc/include/clc/clcmacro.h | 28 +++++++++ .../binary_decl_with_scalar_second_arg.inc | 4 ++ libclc/clc/include/clc/math/clc_nextafter.h | 12 ++++ libclc/clc/include/clc/relational/clc_isnan.h | 7 --- libclc/clc/include/clc/shared/binary_decl.inc | 2 + libclc/clc/lib/clspv/SOURCES | 1 + libclc/clc/lib/generic/SOURCES | 1 + libclc/clc/lib/generic/math/clc_nextafter.cl | 62 +++++++++++++++++++ libclc/clc/lib/spirv/SOURCES | 1 + libclc/clc/lib/spirv64/SOURCES | 1 + libclc/clspv/lib/SOURCES | 3 +- libclc/clspv/lib/math/nextafter.cl | 5 -- libclc/clspv/lib/math/nextafter.inc | 3 - .../generic/include/clc/math/binary_decl.inc | 2 - libclc/generic/include/clc/math/fmax.h | 2 +- libclc/generic/include/clc/math/fmin.h | 2 +- libclc/generic/include/math/clc_nextafter.h | 7 --- libclc/generic/lib/SOURCES | 1 - libclc/generic/lib/math/clc_nextafter.cl | 49 --------------- libclc/generic/lib/math/nextafter.cl | 18 +++++- libclc/ptx/lib/SOURCES | 1 - libclc/ptx/lib/math/nextafter.cl | 10 --- 24 files changed, 130 insertions(+), 108 deletions(-) delete mode 100644 libclc/amdgpu/lib/math/nextafter.cl create mode 100644 libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc create mode 100644 libclc/clc/include/clc/math/clc_nextafter.h create mode 100644 libclc/clc/include/clc/shared/binary_decl.inc create mode 100644 libclc/clc/lib/generic/math/clc_nextafter.cl delete mode 100644 libclc/clspv/lib/math/nextafter.cl delete mode 100644 libclc/clspv/lib/math/nextafter.inc delete mode 100644 libclc/generic/include/clc/math/binary_decl.inc delete mode 100644 libclc/generic/include/math/clc_nextafter.h delete mode 100644 libclc/generic/lib/math/clc_nextafter.cl delete mode 100644 libclc/ptx/lib/SOURCES delete mode 100644 libclc/ptx/lib/math/nextafter.cl diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES index b11cbdecf27b9d..24f099d049cd34 100644 --- 
a/libclc/amdgpu/lib/SOURCES +++ b/libclc/amdgpu/lib/SOURCES @@ -10,5 +10,4 @@ math/half_log2.cl math/half_recip.cl math/half_rsqrt.cl math/half_sqrt.cl -math/nextafter.cl math/sqrt.cl diff --git a/libclc/amdgpu/lib/math/nextafter.cl b/libclc/amdgpu/lib/math/nextafter.cl deleted file mode 100644 index 6dc117b8cdd64c..00000000000000 --- a/libclc/amdgpu/lib/math/nextafter.cl +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include -#include - -_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __clc_nextafter, double, double) -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_DEFINE_BINARY_BUILTIN(half, nextafter, __clc_nextafter, half, half) -#endif diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index 676560e9efcb44..14399811bad938 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -159,6 +159,34 @@ _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) +// FIXME: Make _CLC_DEFINE_BINARY_BUILTIN avoid scalarization by default, and +// introduce an explicit scalarizing version. 
+#define _CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(RET_TYPE, FUNCTION, BUILTIN, \ + ARG1_TYPE, ARG2_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return BUILTIN(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, \ + ARG2_TYPE##2 y) { \ + return BUILTIN(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, \ + ARG2_TYPE##3 y) { \ + return BUILTIN(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, \ + ARG2_TYPE##4 y) { \ + return BUILTIN(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, \ + ARG2_TYPE##8 y) { \ + return BUILTIN(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, \ + ARG2_TYPE##16 y) { \ + return BUILTIN(x, y); \ + } + #define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG( \ RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \ _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ diff --git a/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc b/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc new file mode 100644 index 00000000000000..92b7b287f64bb7 --- /dev/null +++ b/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc @@ -0,0 +1,4 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); diff --git a/libclc/clc/include/clc/math/clc_nextafter.h b/libclc/clc/include/clc/math/clc_nextafter.h new file mode 100644 index 00000000000000..599c022c6682b0 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_nextafter.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_NEXTAFTER_H__ +#define __CLC_MATH_CLC_NEXTAFTER_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_nextafter + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_NEXTAFTER_H__ diff --git 
a/libclc/clc/include/clc/relational/clc_isnan.h b/libclc/clc/include/clc/relational/clc_isnan.h index 3200e593c5cff4..2483a1ab255787 100644 --- a/libclc/clc/include/clc/relational/clc_isnan.h +++ b/libclc/clc/include/clc/relational/clc_isnan.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISNAN_H__ #define __CLC_RELATIONAL_CLC_ISNAN_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isnan -#define __clc_isnan isnan -#else - #include #include @@ -37,6 +32,4 @@ _CLC_VECTOR_ISNAN_DECL(short, half) #undef _CLC_ISNAN_DECL #undef _CLC_VECTOR_ISNAN_DECL -#endif - #endif // __CLC_RELATIONAL_CLC_ISNAN_H__ diff --git a/libclc/clc/include/clc/shared/binary_decl.inc b/libclc/clc/include/clc/shared/binary_decl.inc new file mode 100644 index 00000000000000..50fd1df34fd245 --- /dev/null +++ b/libclc/clc/include/clc/shared/binary_decl.inc @@ -0,0 +1,2 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y); diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES index c3fc03c0b3dd5e..81f90a24d00d60 100644 --- a/libclc/clc/lib/clspv/SOURCES +++ b/libclc/clc/lib/clspv/SOURCES @@ -2,6 +2,7 @@ ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl ../generic/math/clc_mad.cl +../generic/math/clc_nextafter.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl ../generic/relational/clc_select.cl diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 877a0a390a7452..59dad8e8606891 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -8,6 +8,7 @@ math/clc_ceil.cl math/clc_fabs.cl math/clc_floor.cl math/clc_mad.cl +math/clc_nextafter.cl math/clc_rint.cl math/clc_trunc.cl relational/clc_all.cl diff --git a/libclc/clc/lib/generic/math/clc_nextafter.cl b/libclc/clc/lib/generic/math/clc_nextafter.cl new file mode 100644 index 00000000000000..58125485bf6845 --- /dev/null +++ 
b/libclc/clc/lib/generic/math/clc_nextafter.cl @@ -0,0 +1,62 @@ +#include +#include +#include + +// This file provides OpenCL C implementations of __clc_nextafter for +// targets that don't support the clang builtin. + +#define CLC_AS_TYPE(x) __clc_as_##x + +#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE, INT_TYPE_SCALAR) \ + _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, \ + FLOAT_TYPE y) { \ + const UINT_TYPE sign_bit = (UINT_TYPE)1 \ + << (sizeof(INT_TYPE_SCALAR) * 8 - 1); \ + const UINT_TYPE sign_bit_mask = sign_bit - (UINT_TYPE)1; \ + INT_TYPE ix = CLC_AS_TYPE(INT_TYPE)(x); \ + UINT_TYPE ax = CLC_AS_TYPE(UINT_TYPE)(ix) & sign_bit_mask; \ + INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(sign_bit) - ix; \ + mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx : ix; \ + INT_TYPE iy = CLC_AS_TYPE(INT_TYPE)(y); \ + UINT_TYPE ay = CLC_AS_TYPE(UINT_TYPE)(iy) & sign_bit_mask; \ + INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(sign_bit) - iy; \ + my = iy < (INT_TYPE)0 ? my : iy; \ + INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE)-1); \ + INT_TYPE r = CLC_AS_TYPE(INT_TYPE)(sign_bit) - t; \ + r = t < (INT_TYPE)0 ? r : t; \ + r = __clc_isnan(x) ? ix : r; \ + r = __clc_isnan(y) ? CLC_AS_TYPE(INT_TYPE)(iy) : r; \ + r = ((ax | ay) == (UINT_TYPE)0 || ix == iy) ? 
iy : r; \ + return CLC_AS_TYPE(FLOAT_TYPE)(r); \ + } + +NEXTAFTER(float, uint, int, int) +NEXTAFTER(float2, uint2, int2, int) +NEXTAFTER(float3, uint3, int3, int) +NEXTAFTER(float4, uint4, int4, int) +NEXTAFTER(float8, uint8, int8, int) +NEXTAFTER(float16, uint16, int16, int) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +NEXTAFTER(double, ulong, long, long) +NEXTAFTER(double2, ulong2, long2, long) +NEXTAFTER(double3, ulong3, long3, long) +NEXTAFTER(double4, ulong4, long4, long) +NEXTAFTER(double8, ulong8, long8, long) +NEXTAFTER(double16, ulong16, long16, long) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +NEXTAFTER(half, ushort, short, short) +NEXTAFTER(half2, ushort2, short2, short) +NEXTAFTER(half3, ushort3, short3, short) +NEXTAFTER(half4, ushort4, short4, short) +NEXTAFTER(half8, ushort8, short8, short) +NEXTAFTER(half16, ushort16, short16, short) + +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 55d109478faac5..813b1e3d699378 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -6,6 +6,7 @@ ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl ../generic/math/clc_mad.cl +../generic/math/clc_nextafter.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl ../generic/relational/clc_select.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 55d109478faac5..813b1e3d699378 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -6,6 +6,7 @@ ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl ../generic/math/clc_mad.cl +../generic/math/clc_nextafter.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl ../generic/relational/clc_select.cl diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index 7c369aa379e985..0d6091ce20e449 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -1,5 +1,4 @@ math/fma.cl -math/nextafter.cl 
shared/vstore_half.cl subnormal_config.cl ../../generic/lib/geometric/distance.cl @@ -21,7 +20,6 @@ subnormal_config.cl ../../generic/lib/math/clc_fmod.cl ../../generic/lib/math/clc_hypot.cl ../../generic/lib/math/clc_ldexp.cl -../../generic/lib/math/clc_nextafter.cl ../../generic/lib/math/clc_pow.cl ../../generic/lib/math/clc_pown.cl ../../generic/lib/math/clc_powr.cl @@ -71,6 +69,7 @@ subnormal_config.cl ../../generic/lib/math/minmag.cl ../../generic/lib/math/modf.cl ../../generic/lib/math/nan.cl +../../generic/lib/math/nextafter.cl ../../generic/lib/math/pow.cl ../../generic/lib/math/pown.cl ../../generic/lib/math/powr.cl diff --git a/libclc/clspv/lib/math/nextafter.cl b/libclc/clspv/lib/math/nextafter.cl deleted file mode 100644 index f05e7482136203..00000000000000 --- a/libclc/clspv/lib/math/nextafter.cl +++ /dev/null @@ -1,5 +0,0 @@ -#include -#include - -#define __CLC_BODY -#include diff --git a/libclc/clspv/lib/math/nextafter.inc b/libclc/clspv/lib/math/nextafter.inc deleted file mode 100644 index ee39be53b1e17c..00000000000000 --- a/libclc/clspv/lib/math/nextafter.inc +++ /dev/null @@ -1,3 +0,0 @@ -_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE nextafter(__CLC_GENTYPE x, __CLC_GENTYPE y) { - return __clc_nextafter(x, y); -} diff --git a/libclc/generic/include/clc/math/binary_decl.inc b/libclc/generic/include/clc/math/binary_decl.inc deleted file mode 100644 index 41f07c3b014b7e..00000000000000 --- a/libclc/generic/include/clc/math/binary_decl.inc +++ /dev/null @@ -1,2 +0,0 @@ -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b); diff --git a/libclc/generic/include/clc/math/fmax.h b/libclc/generic/include/clc/math/fmax.h index 71ee859be4f944..2bb475851bae97 100644 --- a/libclc/generic/include/clc/math/fmax.h +++ b/libclc/generic/include/clc/math/fmax.h @@ -1,4 +1,4 @@ -#define __CLC_BODY +#define __CLC_BODY #define __CLC_FUNCTION fmax 
#include diff --git a/libclc/generic/include/clc/math/fmin.h b/libclc/generic/include/clc/math/fmin.h index d45f572e08b026..71154cc67b99b0 100644 --- a/libclc/generic/include/clc/math/fmin.h +++ b/libclc/generic/include/clc/math/fmin.h @@ -1,4 +1,4 @@ -#define __CLC_BODY +#define __CLC_BODY #define __CLC_FUNCTION fmin #include diff --git a/libclc/generic/include/math/clc_nextafter.h b/libclc/generic/include/math/clc_nextafter.h deleted file mode 100644 index 2b674b7079568f..00000000000000 --- a/libclc/generic/include/math/clc_nextafter.h +++ /dev/null @@ -1,7 +0,0 @@ -#define __CLC_BODY -#define __CLC_FUNCTION __clc_nextafter - -#include - -#undef __CLC_BODY -#undef __CLC_FUNCTION diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index 579e909e53d462..217e3bca48b7ac 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -159,7 +159,6 @@ math/native_sin.cl math/native_sqrt.cl math/native_tan.cl math/tables.cl -math/clc_nextafter.cl math/nextafter.cl math/clc_pow.cl math/pow.cl diff --git a/libclc/generic/lib/math/clc_nextafter.cl b/libclc/generic/lib/math/clc_nextafter.cl deleted file mode 100644 index 623eb11c18c3b1..00000000000000 --- a/libclc/generic/lib/math/clc_nextafter.cl +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include - -// This file provides OpenCL C implementations of nextafter for -// targets that don't support the clang builtin. - -#define AS_TYPE(x) as_##x - -#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE) \ - _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, \ - FLOAT_TYPE y) { \ - const UINT_TYPE sign_bit = (UINT_TYPE)1 << (sizeof(INT_TYPE) * 8 - 1); \ - const UINT_TYPE sign_bit_mask = sign_bit - 1; \ - INT_TYPE ix = AS_TYPE(INT_TYPE)(x); \ - INT_TYPE ax = ix & sign_bit_mask; \ - INT_TYPE mx = sign_bit - ix; \ - mx = ix < 0 ? mx : ix; \ - INT_TYPE iy = AS_TYPE(INT_TYPE)(y); \ - INT_TYPE ay = iy & sign_bit_mask; \ - INT_TYPE my = sign_bit - iy; \ - my = iy < 0 ? 
my : iy; \ - INT_TYPE t = mx + (mx < my ? 1 : -1); \ - INT_TYPE r = sign_bit - t; \ - r = t < 0 ? r : t; \ - r = __clc_isnan(x) ? ix : r; \ - r = __clc_isnan(y) ? iy : r; \ - r = ((ax | ay) == 0 | ix == iy) ? iy : r; \ - return AS_TYPE(FLOAT_TYPE)(r); \ - } - -NEXTAFTER(float, uint, int) -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, - float) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -NEXTAFTER(double, ulong, long) -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, - double) -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -NEXTAFTER(half, ushort, short) -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_nextafter, half, half) -#endif diff --git a/libclc/generic/lib/math/nextafter.cl b/libclc/generic/lib/math/nextafter.cl index cbe54cd4e2662a..e0a7b4e457dc06 100644 --- a/libclc/generic/lib/math/nextafter.cl +++ b/libclc/generic/lib/math/nextafter.cl @@ -1,12 +1,24 @@ #include -#include "../clcmacro.h" +#include +#include -_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __builtin_nextafterf, float, float) +_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(float, nextafter, __clc_nextafter, + float, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __builtin_nextafter, double, double) +_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(double, nextafter, __clc_nextafter, + double, double) + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(half, nextafter, __clc_nextafter, half, + half) #endif diff --git a/libclc/ptx/lib/SOURCES b/libclc/ptx/lib/SOURCES deleted file mode 100644 index ff5bff2cea5445..00000000000000 --- a/libclc/ptx/lib/SOURCES +++ /dev/null @@ -1 +0,0 @@ -math/nextafter.cl diff --git a/libclc/ptx/lib/math/nextafter.cl b/libclc/ptx/lib/math/nextafter.cl deleted file mode 100644 index 
809eecac53d8c4..00000000000000 --- a/libclc/ptx/lib/math/nextafter.cl +++ /dev/null @@ -1,10 +0,0 @@ -#include -#include -#include - -_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __clc_nextafter, double, double) -#endif From e069518f82bc3699dc4fc81bbc99ae4a6d44449e Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 23 Jan 2025 12:28:30 +0000 Subject: [PATCH 132/208] SCEV: cover a codepath in isImpliedCondBalancedTypes (#123070) The code that checks a predicate against a swapped predicate in isImpliedCondBalancedTypes is not covered by any existing test, within any Analysis or Transform. Fix this by adding a test to SCEV. --- .../ScalarEvolution/implied-via-division.ll | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll index fbe69b4b188977..a1d30406095ec5 100644 --- a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll @@ -411,3 +411,56 @@ header: exit: ret void } + +define void @swapped_predicate(i32 %n) { +; Prove that (n s>= 1) ===> (0 s>= -n / 2). 
+; CHECK-LABEL: 'swapped_predicate' +; CHECK-NEXT: Determining loop execution counts for: @swapped_predicate +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sge i32 %n, 1 + %n.div.2 = sdiv i32 %n, 2 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %minus.indvar = sub nsw i32 0, %indvar + %minus.n.div.2 = sub nsw i32 0, %n.div.2 + %exitcond = icmp sge i32 %minus.indvar, %minus.n.div.2 + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @swapped_predicate_neg(i32 %n) { +; Prove that (n s>= 1) =\=> (-n / 2 s>= 0). +; CHECK-LABEL: 'swapped_predicate_neg' +; CHECK-NEXT: Determining loop execution counts for: @swapped_predicate_neg +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; +entry: + %cmp1 = icmp sge i32 %n, 1 + %n.div.2 = sdiv i32 %n, 2 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %minus.indvar = sub nsw i32 0, %indvar + %minus.n.div.2 = sub nsw i32 0, %n.div.2 + %exitcond = icmp sge i32 %minus.n.div.2, %minus.indvar + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} From 0e944a30954e666cba2bf17497fafe835e4b3519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= Date: Thu, 23 Jan 2025 14:47:36 +0200 Subject: [PATCH 133/208] [SCFToGPU] Convert scf.parallel+scf.reduce to gpu.all_reduce (#122782) Support reductions in SCFToGPU: `scf.parallel` and `scf.reduce` op combination is now converted to a `gpu.all_reduce` op. --- mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 36 ++- .../Conversion/SCFToGPU/parallel_loop.mlir | 213 ++++++++++++++++++ 2 files changed, 247 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index dece254c325fcd..1ac95ebcdc87f3 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -408,8 +408,8 @@ static LogicalResult processParallelLoop( ArrayAttr mapping = parallelOp->getAttrOfType(gpu::getMappingAttrName()); - // TODO: Support reductions. - if (!mapping || parallelOp.getNumResults() != 0) + // TODO: Support multiple reductions. + if (!mapping || parallelOp.getNumResults() > 1) return failure(); Location loc = parallelOp.getLoc(); @@ -556,6 +556,11 @@ static LogicalResult processParallelLoop( Block *body = parallelOp.getBody(); worklist.reserve(worklist.size() + body->getOperations().size()); + // Include scf.reduce terminator if exists and has an operand. 
+ if (auto terminator = body->getTerminator(); + isa(terminator) && terminator->getOperands().size() == 1) { + worklist.push_back(terminator); + } for (Operation &op : llvm::reverse(body->without_terminator())) worklist.push_back(&op); return success(); @@ -648,6 +653,33 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointAfter(parent); leftNestingScope = true; seenSideeffects = false; + } else if (auto reduceOp = dyn_cast(op)) { + // Convert scf.reduction op + auto parentLoop = op->getParentOfType(); + if (!parentLoop || op->getOperands().size() != 1) + return failure(); + auto operand = op->getOperands().front(); + auto newValue = cloningMap.lookupOrNull(operand); + if (!newValue || !operand.getType().isSignlessIntOrFloat()) + return failure(); + // Ensure reduction region is isolated from above. + llvm::SetVector externalValues; + getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues); + if (externalValues.size()) + return failure(); + // Replace by gpu.all_reduce. + auto gpuRedOp = rewriter.create(loc, newValue); + cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult()); + // Copy region. + rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(), + gpuRedOp.getRegion().begin()); + // Replace src.reduce.return with gpu.yield. + auto scfReturn = gpuRedOp.getRegion().front().getTerminator(); + auto ip = rewriter.saveInsertionPoint(); + rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front()); + rewriter.replaceOpWithNewOp( + scfReturn, scfReturn->getOperands().front()); + rewriter.restoreInsertionPoint(ip); } else { // Otherwise we copy it over. 
Operation *clone = rewriter.clone(*op, cloningMap); diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir index 59441e5ed66290..1dbce05be85b49 100644 --- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir @@ -428,3 +428,216 @@ func.func @step_invariant() { // CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64> // CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64 // CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64> + +// ----- + +// 1-d parallel reduction mapped to block.x and thread.x. + +// CHECK-LABEL: @parallel_reduction_1d +func.func @parallel_reduction_1d() { + %alloc = memref.alloc() : memref + %alloc_0 = memref.alloc() : memref<64xf32> + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) { + %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 { + %1 = memref.load %alloc_0[%arg2] : memref<64xf32> + scf.reduce(%1 : f32) { + ^bb0(%arg3: f32, %arg4: f32): + %2 = arith.addf %arg3, %arg4 : f32 + scf.reduce.return %2 : f32 + } + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.store %0, %alloc[] : memref + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.dealloc %alloc : memref + memref.dealloc %alloc_0 : memref<64xf32> + return +} + +// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref +// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32> +// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: gpu.launch +// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}}) +// CHECK-SAME: 
threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}}) +// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32> +// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] { +// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32): +// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32 +// CHECK-NEXT: gpu.yield %[[sum]] : f32 +// CHECK-NEXT: } : (f32) -> f32 +// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref + +// ----- + +// 2-d parallel reduction mapped to block.x and thread.x and thread.y. + +// CHECK-LABEL: @parallel_reduction_2d +func.func @parallel_reduction_2d() { + %alloc = memref.alloc() : memref + %alloc_0 = memref.alloc() : memref<8x8xf32> + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) { + %0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 { + %1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32> + scf.reduce(%1 : f32) { + ^bb0(%arg4: f32, %arg5: f32): + %2 = arith.addf %arg4, %arg5 : f32 + scf.reduce.return %2 : f32 + } + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.store %0, %alloc[] : memref + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.dealloc %alloc : memref + memref.dealloc %alloc_0 : memref<8x8xf32> + return +} + +// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref +// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32> +// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: 
%[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: gpu.launch +// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}}) +// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}}) +// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32> +// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] { +// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32): +// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32 +// CHECK-NEXT: gpu.yield %[[sum]] : f32 +// CHECK-NEXT: } : (f32) -> f32 +// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref + +// ----- + +// tiled 1-d parallel reduction mapped to block.x and thread.x. 
+ +// CHECK-LABEL: @parallel_reduction_1d_tiled +func.func @parallel_reduction_1d_tiled() { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %alloc_0 = memref.alloc() : memref<8192xf32> + %alloc_1 = memref.alloc() : memref<64xf32> + scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) { + %subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref> + %0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1) + %subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>> + %1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 { + %2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>> + scf.reduce(%2 : f32) { + ^bb0(%arg3: f32, %arg4: f32): + %3 = arith.addf %arg3, %arg4 : f32 + scf.reduce.return %3 : f32 + } + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.store %1, %subview[] : memref> + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.dealloc %alloc_0 : memref<8192xf32> + memref.dealloc %alloc_1 : memref<64xf32> + return +} + +// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32> +// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32> +// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}] +// CHECK: gpu.launch +// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}}) +// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}}) +// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32> +// CHECK-NEXT: 
%[[dim1:.*]] = affine.apply #map2(%[[dim0]]) +// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32> +// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}] +// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>> +// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] { +// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32): +// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32 +// CHECK-NEXT: gpu.yield %[[sum]] : f32 +// CHECK-NEXT: } : (f32) -> f32 +// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref> + +// ----- + +// 1-d parallel reduction, unsigned int. Cannot be mapped. + +// CHECK-LABEL: @parallel_reduction_1d_uint +func.func @parallel_reduction_1d_uint(%cst : ui32) { + %alloc = memref.alloc() : memref + %alloc_0 = memref.alloc() : memref<64xui32> + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) { + %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 { + %1 = memref.load %alloc_0[%arg2] : memref<64xui32> + scf.reduce(%1 : ui32) { + ^bb0(%arg3: ui32, %arg4: ui32): + scf.reduce.return %arg3 : ui32 + } + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.store %0, %alloc[] : memref + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.dealloc %alloc : memref + memref.dealloc %alloc_0 : memref<64xui32> + return +} + +// CHECK: scf.parallel +// CHECK-NEXT: scf.parallel +// CHECK: scf.reduce + +// ----- + +// 1-d parallel reduction, not isolated from above. Cannot be mapped. 
+ +// CHECK-LABEL: @parallel_reduction_1d_outside +func.func @parallel_reduction_1d_outside() { + %alloc = memref.alloc() : memref + %alloc_0 = memref.alloc() : memref<64xf32> + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %const = arith.constant 1.000000e+00 : f32 + scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) { + %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 { + %1 = memref.load %alloc_0[%arg2] : memref<64xf32> + scf.reduce(%1 : f32) { + ^bb0(%arg3: f32, %arg4: f32): + %2 = arith.addf %arg3, %arg4 : f32 + %3 = arith.addf %2, %const : f32 + scf.reduce.return %3 : f32 + } + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.store %0, %alloc[] : memref + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + memref.dealloc %alloc : memref + memref.dealloc %alloc_0 : memref<64xf32> + return +} + +// CHECK: scf.parallel +// CHECK-NEXT: scf.parallel +// CHECK: scf.reduce From 90e9895a9373b3d83eefe15b34d2dc83c7bcc88f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Jan 2025 12:59:59 +0000 Subject: [PATCH 134/208] [X86] Handle BSF/BSR "zero-input pass through" behaviour (#123623) Intel docs have been updated to be similar to AMD and now describe BSF/BSR as not changing the destination register if the input value was zero, which allows us to support CTTZ/CTLZ zero-input cases by setting the destination to support a NumBits result (BSR is a bit messy as it has to be XOR'd to create a CTLZ result). VIA/Zhaoxin x86_64 CPUs have also been confirmed to match this behaviour. This patch adjusts the X86ISD::BSF/BSR nodes to take a "pass through" argument for zero-input cases, by default this is set to UNDEF to match existing behaviour, but it can be set to a suitable value if supported. 
There are still some limits to this - its only supported for x86_64 capable processors (and I've only enabled it for x86_64 codegen), and Intel CPUs sometimes zero the upper 32-bits of a pass through register when used for BSR32/BSF32 with a zero source value (i.e. the whole 64bits may not get passed through). Fixes #122004 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 64 ++++-- llvm/lib/Target/X86/X86InstrCompiler.td | 12 +- llvm/lib/Target/X86/X86InstrFragments.td | 13 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 26 ++- llvm/lib/Target/X86/X86InstrMisc.td | 50 ++-- llvm/lib/Target/X86/X86Subtarget.h | 5 + llvm/test/CodeGen/X86/bit_ceil.ll | 12 +- llvm/test/CodeGen/X86/combine-or.ll | 6 +- llvm/test/CodeGen/X86/ctlo.ll | 14 +- llvm/test/CodeGen/X86/ctlz.ll | 31 +-- llvm/test/CodeGen/X86/cttz.ll | 22 +- llvm/test/CodeGen/X86/known-never-zero.ll | 216 ++++++------------ llvm/test/CodeGen/X86/pr89877.ll | 8 +- llvm/test/CodeGen/X86/pr90847.ll | 16 +- llvm/test/CodeGen/X86/pr92569.ll | 10 +- .../CodeGen/X86/scheduler-backtracking.ll | 140 ++++++------ llvm/test/TableGen/x86-fold-tables.inc | 12 +- .../X86/BtVer2/clear-super-register-1.s | 6 +- 18 files changed, 298 insertions(+), 365 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a956074e50d86f..7a9be6f2af9b40 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -436,7 +436,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { - setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64); setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } @@ -3386,15 +3385,19 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, } bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { 
- // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64. + // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to + // i32/i64 or can rely on BSF passthrough value. return Subtarget.hasBMI() || Subtarget.canUseCMOV() + Subtarget.hasBitScanPassThrough() || (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u)); } bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { - // Speculate ctlz only if we can directly use LZCNT. - return Subtarget.hasLZCNT() || Subtarget.canUseCMOV(); + // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR + // passthrough value. + return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() || + Subtarget.hasBitScanPassThrough(); } bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { @@ -28694,11 +28697,18 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } + // Check if we can safely pass a result through BSR for zero sources. + SDValue PassThru = DAG.getUNDEF(OpVT); + if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() && + !DAG.isKnownNeverZero(Op)) + PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT); + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); - Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op); - if (Opc == ISD::CTLZ) { + // Skip CMOV if we're using a pass through value. + if (Opc == ISD::CTLZ && PassThru.isUndef()) { // If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), @@ -28721,16 +28731,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, unsigned NumBits = VT.getScalarSizeInBits(); SDValue N0 = Op.getOperand(0); SDLoc dl(Op); + bool NonZeroSrc = DAG.isKnownNeverZero(N0); assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); + // Check if we can safely pass a result through BSF for zero sources. + SDValue PassThru = DAG.getUNDEF(VT); + if (!NonZeroSrc && Subtarget.hasBitScanPassThrough()) + PassThru = DAG.getConstant(NumBits, dl, VT); + // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0); - // If src is known never zero we can skip the CMOV. - if (DAG.isKnownNeverZero(N0)) + // Skip CMOV if src is never zero or we're using a pass through value. + if (NonZeroSrc || !PassThru.isUndef()) return Op; // If src is zero (i.e. bsf sets ZF), returns NumBits. @@ -38193,12 +38209,34 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = KnownBits::mul(Known, Known2); break; } - case X86ISD::BSR: - // BSR(0) is undef, but any use of BSR already accounts for non-zero inputs. - // Similar KnownBits behaviour to CTLZ_ZERO_UNDEF. + case X86ISD::BSF: { + Known.Zero.setBitsFrom(Log2_32(BitWidth)); + + KnownBits Known2; + Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + if (Known2.isNonZero()) { + // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros(); + unsigned LowBits = llvm::bit_width(PossibleTZ); + Known.Zero.setBitsFrom(LowBits); + } else if (!Op.getOperand(0).isUndef()) { + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known = Known.intersectWith(Known2); + } + break; + } + case X86ISD::BSR: { // TODO: Bound with input known bits? Known.Zero.setBitsFrom(Log2_32(BitWidth)); + + if (!Op.getOperand(0).isUndef() && + !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) { + KnownBits Known2; + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known = Known.intersectWith(Known2); + } break; + } case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; @@ -54243,7 +54281,7 @@ static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, } SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); - Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op); + Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 7d4c5c0e10e492..9bda3fd7d951c9 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -2213,12 +2213,12 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; // Bit scan instruction patterns to match explicit zero-undef behavior. 
-def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; -def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; -def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; -def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; -def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; -def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>; // When HasMOVBE is enabled it is possible to get a non-legalized // register-register 16 bit bswap. This maps it to a ROL instruction. diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index ea7af893ce103f..ddbc7c55a6113b 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -134,8 +134,8 @@ def SDTX86Cmpccxadd : SDTypeProfile<1, 4, [SDTCisSameAs<0, 2>, def X86MFence : SDNode<"X86ISD::MFENCE", SDTNone, [SDNPHasChain]>; -def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; -def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86bsf : SDNode<"X86ISD::BSF", SDTBinaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTBinaryArithWithFlags>; def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>; def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>; @@ -685,8 +685,9 @@ def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{ // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. 
AssertSext/AssertZext/AssertAlign aren't saying // anything about the upper 32 bits, they're probably just qualifying a -// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit -// operation will zero-extend up to 64 bits. +// CopyFromReg. FREEZE may be coming from a truncate. BitScan fall through +// values may not zero the upper bits correctly. +// Any other 32-bit operation will zero-extend up to 64 bits. def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && @@ -694,7 +695,9 @@ def def32 : PatLeaf<(i32 GR32:$src), [{ N->getOpcode() != ISD::AssertSext && N->getOpcode() != ISD::AssertZext && N->getOpcode() != ISD::AssertAlign && - N->getOpcode() != ISD::FREEZE; + N->getOpcode() != ISD::FREEZE && + !((N->getOpcode() == X86ISD::BSF || N->getOpcode() == X86ISD::BSR) && + (!N->getOperand(0).isUndef() && !isa<ConstantSDNode>(N->getOperand(0)))); }]>; // Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1baac05827c47c..794aa921ca254d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -5220,42 +5220,43 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, } /// Check whether the use can be converted to remove a comparison against zero. -static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { +/// Returns the EFLAGS condition and the operand that we are comparing against zero.
+static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: - return X86::COND_INVALID; + return std::make_pair(X86::COND_INVALID, ~0U); CASE_ND(NEG8r) CASE_ND(NEG16r) CASE_ND(NEG32r) CASE_ND(NEG64r) - return X86::COND_AE; + return std::make_pair(X86::COND_AE, 1U); case X86::LZCNT16rr: case X86::LZCNT32rr: case X86::LZCNT64rr: - return X86::COND_B; + return std::make_pair(X86::COND_B, 1U); case X86::POPCNT16rr: case X86::POPCNT32rr: case X86::POPCNT64rr: - return X86::COND_E; + return std::make_pair(X86::COND_E, 1U); case X86::TZCNT16rr: case X86::TZCNT32rr: case X86::TZCNT64rr: - return X86::COND_B; + return std::make_pair(X86::COND_B, 1U); case X86::BSF16rr: case X86::BSF32rr: case X86::BSF64rr: case X86::BSR16rr: case X86::BSR32rr: case X86::BSR64rr: - return X86::COND_E; + return std::make_pair(X86::COND_E, 2U); case X86::BLSI32rr: case X86::BLSI64rr: - return X86::COND_AE; + return std::make_pair(X86::COND_AE, 1U); case X86::BLSR32rr: case X86::BLSR64rr: case X86::BLSMSK32rr: case X86::BLSMSK64rr: - return X86::COND_B; + return std::make_pair(X86::COND_B, 1U); // TODO: TBM instructions. } } @@ -5336,6 +5337,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool ClearsOverflowFlag = false; bool ShouldUpdateCC = false; bool IsSwapped = false; + unsigned OpNo = 0; X86::CondCode NewCC = X86::COND_INVALID; int64_t ImmDelta = 0; @@ -5391,9 +5393,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // ...
// EFLAGS not changed // testl %eax, %eax // <-- can be removed if (IsCmpZero) { - NewCC = isUseDefConvertible(Inst); - if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() && - Inst.getOperand(1).getReg() == SrcReg) { + std::tie(NewCC, OpNo) = isUseDefConvertible(Inst); + if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() && + Inst.getOperand(OpNo).getReg() == SrcReg) { ShouldUpdateCC = true; MI = &Inst; break; diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 43c02c4f85844c..290d91bb2ce699 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -247,55 +247,55 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), } // Constraints = "$src = $dst", SchedRW // Bit scan instructions. -let Defs = [EFLAGS] in { -def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), +let Defs = [EFLAGS], Constraints = "$fallback = $dst" in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, GR16:$src))]>, TB, OpSize16, Sched<[WriteBSF]>; -def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, (loadi16 addr:$src)))]>, TB, OpSize16, Sched<[WriteBSFLd]>; -def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, GR32:$src))]>, TB, OpSize32, Sched<[WriteBSF]>; -def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins 
i32mem:$src), +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, (loadi32 addr:$src)))]>, TB, OpSize32, Sched<[WriteBSFLd]>; -def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, GR64:$src))]>, TB, Sched<[WriteBSF]>; -def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, (loadi64 addr:$src)))]>, TB, Sched<[WriteBSFLd]>; -def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, GR16:$src))]>, TB, OpSize16, Sched<[WriteBSR]>; -def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, (loadi16 addr:$src)))]>, TB, OpSize16, Sched<[WriteBSRLd]>; -def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, GR32:$src))]>, 
TB, OpSize32, Sched<[WriteBSR]>; -def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, (loadi32 addr:$src)))]>, TB, OpSize32, Sched<[WriteBSRLd]>; -def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, GR64:$src))]>, TB, Sched<[WriteBSR]>; -def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), +def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, (loadi64 addr:$src)))]>, TB, Sched<[WriteBSRLd]>; } // Defs = [EFLAGS] diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index e3cb9ee8ce1909..c399989f115d75 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -263,6 +263,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { return hasBWI() && useAVX512Regs(); } + // Returns true if the destination register of a BSF/BSR instruction is + // not touched if the source register is zero. + // NOTE: i32->i64 implicit zext isn't guaranteed by BSR/BSF pass through. 
+ bool hasBitScanPassThrough() const { return is64Bit(); } + bool isXRaySupported() const override { return is64Bit(); } /// Use clflush if we have SSE2 or we're on x86-64 (even if we asked for diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll index 823453087f6180..1f21fcac8341d5 100644 --- a/llvm/test/CodeGen/X86/bit_ceil.ll +++ b/llvm/test/CodeGen/X86/bit_ceil.ll @@ -10,9 +10,8 @@ define i32 @bit_ceil_i32(i32 %x) { ; NOBMI: # %bb.0: ; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; NOBMI-NEXT: leal -1(%rdi), %eax -; NOBMI-NEXT: bsrl %eax, %eax ; NOBMI-NEXT: movl $63, %ecx -; NOBMI-NEXT: cmovnel %eax, %ecx +; NOBMI-NEXT: bsrl %eax, %ecx ; NOBMI-NEXT: xorl $31, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx @@ -47,9 +46,8 @@ define i32 @bit_ceil_i32(i32 %x) { define i32 @bit_ceil_i32_plus1(i32 noundef %x) { ; NOBMI-LABEL: bit_ceil_i32_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: bsrl %edi, %eax ; NOBMI-NEXT: movl $63, %ecx -; NOBMI-NEXT: cmovnel %eax, %ecx +; NOBMI-NEXT: bsrl %edi, %ecx ; NOBMI-NEXT: xorl $31, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx @@ -86,9 +84,8 @@ define i64 @bit_ceil_i64(i64 %x) { ; NOBMI-LABEL: bit_ceil_i64: ; NOBMI: # %bb.0: ; NOBMI-NEXT: leaq -1(%rdi), %rax -; NOBMI-NEXT: bsrq %rax, %rax ; NOBMI-NEXT: movl $127, %ecx -; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: bsrq %rax, %rcx ; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx @@ -122,9 +119,8 @@ define i64 @bit_ceil_i64(i64 %x) { define i64 @bit_ceil_i64_plus1(i64 noundef %x) { ; NOBMI-LABEL: bit_ceil_i64_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: bsrq %rdi, %rax ; NOBMI-NEXT: movl $127, %ecx -; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: bsrq %rdi, %rcx ; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll index d9c6d7053be746..08262e4d34b269 100644 --- 
a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -227,9 +227,8 @@ define i64 @PR89533(<64 x i8> %a0) { ; SSE-NEXT: orl %eax, %edx ; SSE-NEXT: shlq $32, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: bsfq %rdx, %rcx ; SSE-NEXT: movl $64, %eax -; SSE-NEXT: cmovneq %rcx, %rax +; SSE-NEXT: rep bsfq %rdx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: PR89533: @@ -255,9 +254,8 @@ define i64 @PR89533(<64 x i8> %a0) { ; AVX1-NEXT: orl %eax, %edx ; AVX1-NEXT: shlq $32, %rdx ; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: bsfq %rdx, %rcx ; AVX1-NEXT: movl $64, %eax -; AVX1-NEXT: cmovneq %rcx, %rax +; AVX1-NEXT: rep bsfq %rdx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll index 2f4fef82f1f17a..fecb62fbc5aea6 100644 --- a/llvm/test/CodeGen/X86/ctlo.ll +++ b/llvm/test/CodeGen/X86/ctlo.ll @@ -44,10 +44,9 @@ define i8 @ctlo_i8(i8 %x) { ; X64-LABEL: ctlo_i8: ; X64: # %bb.0: ; X64-NEXT: notb %dil -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: movl $15, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -146,9 +145,8 @@ define i16 @ctlo_i16(i16 %x) { ; X64-LABEL: ctlo_i16: ; X64: # %bb.0: ; X64-NEXT: notl %edi -; X64-NEXT: bsrw %di, %cx ; X64-NEXT: movw $31, %ax -; X64-NEXT: cmovnew %cx, %ax +; X64-NEXT: bsrw %di, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -232,9 +230,8 @@ define i32 @ctlo_i32(i32 %x) { ; X64-LABEL: ctlo_i32: ; X64: # %bb.0: ; X64-NEXT: notl %edi -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq ; @@ -335,9 +332,8 @@ define i64 @ctlo_i64(i64 %x) { ; X64-LABEL: ctlo_i64: ; X64: # %bb.0: ; X64-NEXT: notq %rdi -; X64-NEXT: bsrq %rdi, %rcx ; 
X64-NEXT: movl $127, %eax -; X64-NEXT: cmovneq %rcx, %rax +; X64-NEXT: bsrq %rdi, %rax ; X64-NEXT: xorq $63, %rax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll index 68defaff78d37d..0eabfeae853f79 100644 --- a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -246,10 +246,9 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; ; X64-LABEL: ctlz_i8_zero_test: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: movl $15, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -317,9 +316,8 @@ define i16 @ctlz_i16_zero_test(i16 %n) { ; ; X64-LABEL: ctlz_i16_zero_test: ; X64: # %bb.0: -; X64-NEXT: bsrw %di, %cx ; X64-NEXT: movw $31, %ax -; X64-NEXT: cmovnew %cx, %ax +; X64-NEXT: bsrw %di, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -372,9 +370,8 @@ define i32 @ctlz_i32_zero_test(i32 %n) { ; ; X64-LABEL: ctlz_i32_zero_test: ; X64: # %bb.0: -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq ; @@ -442,9 +439,8 @@ define i64 @ctlz_i64_zero_test(i64 %n) { ; ; X64-LABEL: ctlz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: bsrq %rdi, %rcx ; X64-NEXT: movl $127, %eax -; X64-NEXT: cmovneq %rcx, %rax +; X64-NEXT: bsrq %rdi, %rax ; X64-NEXT: xorq $63, %rax ; X64-NEXT: retq ; @@ -613,9 +609,8 @@ define i32 @ctlz_bsr_zero_test(i32 %n) { ; ; X64-LABEL: ctlz_bsr_zero_test: ; X64: # %bb.0: -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_bsr_zero_test: @@ -983,10 +978,9 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; ; X64-LABEL: ctlz_xor7_i8_false: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, 
%eax -; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: movl $15, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -1094,9 +1088,8 @@ define i32 @ctlz_xor31_i32_false(i32 %x) { ; ; X64-LABEL: ctlz_xor31_i32_false: ; X64: # %bb.0: -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_xor31_i32_false: @@ -1239,9 +1232,8 @@ define i64 @ctlz_i32_sext(i32 %x) { ; ; X64-LABEL: ctlz_i32_sext: ; X64: # %bb.0: -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i32_sext: @@ -1302,9 +1294,8 @@ define i64 @ctlz_i32_zext(i32 %x) { ; ; X64-LABEL: ctlz_i32_zext: ; X64: # %bb.0: -; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: movl $63, %eax -; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i32_zext: diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll index 30e5cccfb21982..db949827af0074 100644 --- a/llvm/test/CodeGen/X86/cttz.ll +++ b/llvm/test/CodeGen/X86/cttz.ll @@ -324,11 +324,8 @@ define i32 @cttz_i32_zero_test(i32 %n) { ; ; X64-LABEL: cttz_i32_zero_test: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i32_zero_test: @@ -393,9 +390,8 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; ; X64-LABEL: cttz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: bsfq %rdi, %rcx ; X64-NEXT: movl $64, %eax -; X64-NEXT: cmovneq %rcx, %rax +; X64-NEXT: rep bsfq %rdi, %rax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i64_zero_test: @@ 
-687,10 +683,8 @@ define i64 @cttz_i32_sext(i32 %x) { ; ; X64-LABEL: cttz_i32_sext: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i32_sext: @@ -744,10 +738,8 @@ define i64 @cttz_i32_zext(i32 %x) { ; ; X64-LABEL: cttz_i32_zext: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i32_zext: diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 6c0aaeb451e14a..63336ffa7c6c8a 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -51,12 +51,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: or_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: orl %esi, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %z = or i32 %x, %y %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -104,13 +101,11 @@ define i32 @select_maybe_zero(i1 %c, i32 %x) { ; X64-LABEL: select_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: orl $1, %esi -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: testb $1, %dil -; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: cmovnel %esi, %ecx +; X64-NEXT: movl 
$32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %y = or i32 %x, 1 %z = select i1 %c, i32 %y, i32 0 @@ -201,14 +196,11 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: shl_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %z = shl nuw nsw i32 %y, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -252,12 +244,10 @@ define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: uaddsat_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: addl %esi, %edi -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovael %edi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: cmovael %edi, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -306,13 +296,10 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: umax_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: cmpl %esi, %edi ; X64-NEXT: cmoval %edi, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %z = call i32 @llvm.umax.i32(i32 %x, i32 %y) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -365,12 +352,10 @@ define i32 @umin_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: umin_maybe_zero: ; X64: 
# %bb.0: ; X64-NEXT: cmpl $54, %edi -; X64-NEXT: movl $54, %eax -; X64-NEXT: cmovbl %edi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $54, %ecx +; X64-NEXT: cmovbl %edi, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = call i32 @llvm.umin.i32(i32 %x, i32 54) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -479,12 +464,10 @@ define i32 @smin_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: smin_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: cmpl $54, %edi -; X64-NEXT: movl $54, %eax -; X64-NEXT: cmovll %edi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $54, %ecx +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = call i32 @llvm.smin.i32(i32 %x, i32 54) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -593,12 +576,10 @@ define i32 @smax_known_zero(i32 %x, i32 %y) { ; X64-LABEL: smax_known_zero: ; X64: # %bb.0: ; X64-NEXT: testl %edi, %edi -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovnsl %edi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: cmovnsl %edi, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = call i32 @llvm.smax.i32(i32 %x, i32 -1) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -646,13 +627,10 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: rotr_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: rorl %cl, %edi -; X64-NEXT: movabsq 
$4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %shr = lshr i32 %x, %y %sub = sub i32 32, %y @@ -700,13 +678,10 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: rotr_with_fshr_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: rorl %cl, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %z = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %y) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -754,13 +729,10 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: rotl_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -808,13 +780,10 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) { ; X64-LABEL: rotl_with_fshl_maybe_zero: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep 
bsfl %edi, %eax ; X64-NEXT: retq %z = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %y) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -880,14 +849,11 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: sra_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %z = ashr exact i32 %y, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -953,14 +919,11 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: srl_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %z = lshr exact i32 %y, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1007,11 +970,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %esi -; X64-NEXT: # kill: def $eax killed $eax def $rax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %ecx +; X64-NEXT: rep bsfl %eax, %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: retq %z = udiv exact i32 %x, %y %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1058,11 +1019,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: cltd ; 
X64-NEXT: idivl %esi -; X64-NEXT: # kill: def $eax killed $eax def $rax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %ecx +; X64-NEXT: rep bsfl %eax, %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: retq %z = sdiv exact i32 %x, %y %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1103,13 +1062,10 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) { ; ; X64-LABEL: add_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: orl $1, %edi ; X64-NEXT: addl %esi, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %x = or i32 %xx, 1 %z = add nsw i32 %x, %y @@ -1182,13 +1138,11 @@ define i32 @sub_maybe_zero(i32 %x) { ; ; X64-LABEL: sub_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl $64, %eax -; X64-NEXT: subl %edi, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: orl $64, %ecx +; X64-NEXT: subl %edi, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %y = or i32 %x, 64 %z = sub i32 %y, %x @@ -1208,12 +1162,9 @@ define i32 @sub_maybe_zero2(i32 %x) { ; ; X64-LABEL: sub_maybe_zero2: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: negl %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %z = sub i32 0, %x %r = call i32 @llvm.cttz.i32(i32 
%z, i1 false) @@ -1233,13 +1184,10 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) { ; ; X64-LABEL: mul_known_nonzero_nsw: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: orl $256, %esi # imm = 0x100 ; X64-NEXT: imull %edi, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %y = or i32 %yy, 256 %z = mul nsw i32 %y, %x @@ -1260,13 +1208,10 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) { ; ; X64-LABEL: mul_known_nonzero_nuw: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: orl $256, %esi # imm = 0x100 ; X64-NEXT: imull %edi, %esi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rsi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %esi, %eax ; X64-NEXT: retq %y = or i32 %yy, 256 %z = mul nuw i32 %y, %x @@ -1286,12 +1231,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) { ; ; X64-LABEL: mul_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: imull %esi, %edi -; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: rep bsfq %rax, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq %z = mul nuw nsw i32 %y, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1321,11 +1263,9 @@ define i32 @bitcast_known_nonzero(<2 x i16> %xx) { ; X64-NEXT: vcvttps2dq %xmm0, %xmm0 ; X64-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,u,u,u,u,u,u] -; X64-NEXT: vmovd %xmm0, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; 
X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: vmovd %xmm0, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %x = shl nuw nsw <2 x i16> , %xx %z = bitcast <2 x i16> %x to i32 @@ -1344,11 +1284,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) { ; ; X64-LABEL: bitcast_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: vmovd %xmm0, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: vmovd %xmm0, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = bitcast <2 x i16> %x to i32 %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1365,11 +1303,9 @@ define i32 @bitcast_from_float(float %x) { ; ; X64-LABEL: bitcast_from_float: ; X64: # %bb.0: -; X64-NEXT: vmovd %xmm0, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: vmovd %xmm0, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = bitcast float %x to i32 %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1412,11 +1348,9 @@ define i32 @zext_maybe_zero(i16 %x) { ; ; X64-LABEL: zext_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: movzwl %di, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = zext i16 %x to i32 %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -1459,11 +1393,9 @@ define i32 @sext_maybe_zero(i16 %x) { ; ; X64-LABEL: sext_maybe_zero: ; X64: # %bb.0: -; X64-NEXT: movswl %di, %eax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx 
-; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %z = sext i16 %x to i32 %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll index 19baad26583ada..a40ad8f9412788 100644 --- a/llvm/test/CodeGen/X86/pr89877.ll +++ b/llvm/test/CodeGen/X86/pr89877.ll @@ -20,11 +20,9 @@ define i32 @sext_known_nonzero(i16 %xx) { ; X64-NEXT: movl $256, %eax # imm = 0x100 ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %eax -; X64-NEXT: movswq %ax, %rax -; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: rep bsfq %rcx, %rax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: movswl %ax, %ecx +; X64-NEXT: movl $32, %eax +; X64-NEXT: rep bsfl %ecx, %eax ; X64-NEXT: retq %x = shl i16 256, %xx %z = sext i16 %x to i32 diff --git a/llvm/test/CodeGen/X86/pr90847.ll b/llvm/test/CodeGen/X86/pr90847.ll index f2d43c3ed8d5bd..11669f321704e3 100644 --- a/llvm/test/CodeGen/X86/pr90847.ll +++ b/llvm/test/CodeGen/X86/pr90847.ll @@ -14,11 +14,9 @@ define i32 @PR90847(<8 x float> %x) nounwind { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovmskps %ymm0, %eax -; AVX1-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: rep bsfq %rcx, %rax -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: vmovmskps %ymm0, %ecx +; AVX1-NEXT: movl $32, %eax +; AVX1-NEXT: rep bsfl %ecx, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -31,11 +29,9 @@ define i32 @PR90847(<8 x float> %x) nounwind { ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vminps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovmskps %ymm0, 
%eax -; AVX2-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: rep bsfq %rcx, %rax -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vmovmskps %ymm0, %ecx +; AVX2-NEXT: movl $32, %eax +; AVX2-NEXT: rep bsfl %ecx, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll index 0fb4ed7905287c..5f306e998398f2 100644 --- a/llvm/test/CodeGen/X86/pr92569.ll +++ b/llvm/test/CodeGen/X86/pr92569.ll @@ -4,13 +4,11 @@ define void @PR92569(i64 %arg, <8 x i8> %arg1) { ; CHECK-LABEL: PR92569: ; CHECK: # %bb.0: -; CHECK-NEXT: bsfq %rdi, %rax -; CHECK-NEXT: movl $64, %ecx -; CHECK-NEXT: cmovneq %rax, %rcx -; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: rep bsfq %rdi, %rax +; CHECK-NEXT: shrb $3, %al ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: andl $15, %eax +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax ; CHECK-NEXT: movl %eax, 0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll index 6be79edbe51e10..426587a84ce179 100644 --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -234,16 +234,15 @@ define i256 @test2(i256 %a) nounwind { ; ILP-NEXT: xorq $63, %rdx ; ILP-NEXT: andq %rsi, %r11 ; ILP-NEXT: movl $127, %esi -; ILP-NEXT: bsrq %r11, %r8 -; ILP-NEXT: cmoveq %rsi, %r8 -; ILP-NEXT: xorq $63, %r8 -; ILP-NEXT: addq $64, %r8 +; ILP-NEXT: bsrq %r11, %rsi +; ILP-NEXT: xorq $63, %rsi +; ILP-NEXT: addq $64, %rsi ; ILP-NEXT: testq %r10, %r10 -; ILP-NEXT: cmovneq %rdx, %r8 -; ILP-NEXT: subq $-128, %r8 +; ILP-NEXT: cmovneq %rdx, %rsi +; ILP-NEXT: subq $-128, %rsi ; ILP-NEXT: orq %rdi, %r9 -; ILP-NEXT: cmovneq %rcx, %r8 -; ILP-NEXT: movq %r8, (%rax) +; ILP-NEXT: cmovneq %rcx, %rsi +; ILP-NEXT: movq %rsi, (%rax) ; 
ILP-NEXT: movq $0, 8(%rax) ; ILP-NEXT: retq ; @@ -274,16 +273,15 @@ define i256 @test2(i256 %a) nounwind { ; HYBRID-NEXT: xorq $63, %rdx ; HYBRID-NEXT: andq %rsi, %r11 ; HYBRID-NEXT: movl $127, %esi -; HYBRID-NEXT: bsrq %r11, %r8 -; HYBRID-NEXT: cmoveq %rsi, %r8 -; HYBRID-NEXT: xorq $63, %r8 -; HYBRID-NEXT: addq $64, %r8 +; HYBRID-NEXT: bsrq %r11, %rsi +; HYBRID-NEXT: xorq $63, %rsi +; HYBRID-NEXT: addq $64, %rsi ; HYBRID-NEXT: testq %r10, %r10 -; HYBRID-NEXT: cmovneq %rdx, %r8 -; HYBRID-NEXT: subq $-128, %r8 +; HYBRID-NEXT: cmovneq %rdx, %rsi +; HYBRID-NEXT: subq $-128, %rsi ; HYBRID-NEXT: orq %rdi, %r9 -; HYBRID-NEXT: cmovneq %rcx, %r8 -; HYBRID-NEXT: movq %r8, (%rax) +; HYBRID-NEXT: cmovneq %rcx, %rsi +; HYBRID-NEXT: movq %rsi, (%rax) ; HYBRID-NEXT: movq $0, 8(%rax) ; HYBRID-NEXT: retq ; @@ -314,16 +312,15 @@ define i256 @test2(i256 %a) nounwind { ; BURR-NEXT: xorq $63, %rdx ; BURR-NEXT: andq %rsi, %r11 ; BURR-NEXT: movl $127, %esi -; BURR-NEXT: bsrq %r11, %r8 -; BURR-NEXT: cmoveq %rsi, %r8 -; BURR-NEXT: xorq $63, %r8 -; BURR-NEXT: addq $64, %r8 +; BURR-NEXT: bsrq %r11, %rsi +; BURR-NEXT: xorq $63, %rsi +; BURR-NEXT: addq $64, %rsi ; BURR-NEXT: testq %r10, %r10 -; BURR-NEXT: cmovneq %rdx, %r8 -; BURR-NEXT: subq $-128, %r8 +; BURR-NEXT: cmovneq %rdx, %rsi +; BURR-NEXT: subq $-128, %rsi ; BURR-NEXT: orq %rdi, %r9 -; BURR-NEXT: cmovneq %rcx, %r8 -; BURR-NEXT: movq %r8, (%rax) +; BURR-NEXT: cmovneq %rcx, %rsi +; BURR-NEXT: movq %rsi, (%rax) ; BURR-NEXT: movq $0, 8(%rax) ; BURR-NEXT: retq ; @@ -351,19 +348,18 @@ define i256 @test2(i256 %a) nounwind { ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx +; SRC-NEXT: movl $127, %esi ; SRC-NEXT: bsrq %r11, %rsi -; SRC-NEXT: movl $127, %r8d -; SRC-NEXT: cmovneq %rsi, %r8 -; SRC-NEXT: xorq $63, %r8 -; SRC-NEXT: addq $64, %r8 +; SRC-NEXT: xorq $63, %rsi +; SRC-NEXT: addq $64, %rsi ; SRC-NEXT: testq %r10, %r10 -; SRC-NEXT: cmovneq %rcx, %r8 -; SRC-NEXT: subq $-128, %r8 +; SRC-NEXT: cmovneq 
%rcx, %rsi +; SRC-NEXT: subq $-128, %rsi ; SRC-NEXT: orq %r9, %rdi -; SRC-NEXT: cmovneq %rdx, %r8 +; SRC-NEXT: cmovneq %rdx, %rsi ; SRC-NEXT: xorps %xmm0, %xmm0 ; SRC-NEXT: movaps %xmm0, 16(%rax) -; SRC-NEXT: movq %r8, (%rax) +; SRC-NEXT: movq %rsi, (%rax) ; SRC-NEXT: movq $0, 8(%rax) ; SRC-NEXT: retq ; @@ -372,12 +368,11 @@ define i256 @test2(i256 %a) nounwind { ; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: xorps %xmm0, %xmm0 ; LIN-NEXT: movaps %xmm0, 16(%rdi) -; LIN-NEXT: movq %rsi, %rdi -; LIN-NEXT: negq %rdi -; LIN-NEXT: andq %rsi, %rdi -; LIN-NEXT: bsrq %rdi, %rsi ; LIN-NEXT: movl $127, %edi -; LIN-NEXT: cmovneq %rsi, %rdi +; LIN-NEXT: movq %rsi, %r9 +; LIN-NEXT: negq %r9 +; LIN-NEXT: andq %rsi, %r9 +; LIN-NEXT: bsrq %r9, %rdi ; LIN-NEXT: xorq $63, %rdi ; LIN-NEXT: addq $64, %rdi ; LIN-NEXT: xorl %esi, %esi @@ -415,7 +410,6 @@ define i256 @test2(i256 %a) nounwind { define i256 @test3(i256 %n) nounwind { ; ILP-LABEL: test3: ; ILP: # %bb.0: -; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rdi, %rax ; ILP-NEXT: xorps %xmm0, %xmm0 ; ILP-NEXT: movaps %xmm0, 16(%rdi) @@ -429,34 +423,32 @@ define i256 @test3(i256 %n) nounwind { ; ILP-NEXT: sbbq %r8, %r9 ; ILP-NEXT: notq %r8 ; ILP-NEXT: andq %r9, %r8 -; ILP-NEXT: bsrq %r8, %rbx +; ILP-NEXT: bsrq %r8, %r9 ; ILP-NEXT: notq %rdx ; ILP-NEXT: andq %r10, %rdx -; ILP-NEXT: bsrq %rdx, %r9 -; ILP-NEXT: xorq $63, %rbx +; ILP-NEXT: bsrq %rdx, %r10 +; ILP-NEXT: xorq $63, %r9 ; ILP-NEXT: notq %rcx ; ILP-NEXT: andq %r11, %rcx -; ILP-NEXT: bsrq %rcx, %r10 +; ILP-NEXT: bsrq %rcx, %r11 +; ILP-NEXT: xorq $63, %r11 +; ILP-NEXT: orq $64, %r11 +; ILP-NEXT: testq %r8, %r8 +; ILP-NEXT: cmovneq %r9, %r11 ; ILP-NEXT: xorq $63, %r10 -; ILP-NEXT: orq $64, %r10 ; ILP-NEXT: notq %rsi -; ILP-NEXT: testq %r8, %r8 -; ILP-NEXT: cmovneq %rbx, %r10 -; ILP-NEXT: xorq $63, %r9 ; ILP-NEXT: andq %rdi, %rsi ; ILP-NEXT: movl $127, %edi -; ILP-NEXT: bsrq %rsi, %rsi -; ILP-NEXT: cmoveq %rdi, %rsi -; ILP-NEXT: xorq $63, %rsi -; ILP-NEXT: addq $64, %rsi +; ILP-NEXT: bsrq 
%rsi, %rdi +; ILP-NEXT: xorq $63, %rdi +; ILP-NEXT: addq $64, %rdi ; ILP-NEXT: testq %rdx, %rdx -; ILP-NEXT: cmovneq %r9, %rsi -; ILP-NEXT: subq $-128, %rsi +; ILP-NEXT: cmovneq %r10, %rdi +; ILP-NEXT: subq $-128, %rdi ; ILP-NEXT: orq %r8, %rcx -; ILP-NEXT: cmovneq %r10, %rsi -; ILP-NEXT: movq %rsi, (%rax) +; ILP-NEXT: cmovneq %r11, %rdi +; ILP-NEXT: movq %rdi, (%rax) ; ILP-NEXT: movq $0, 8(%rax) -; ILP-NEXT: popq %rbx ; ILP-NEXT: retq ; ; HYBRID-LABEL: test3: @@ -491,16 +483,15 @@ define i256 @test3(i256 %n) nounwind { ; HYBRID-NEXT: notq %rsi ; HYBRID-NEXT: andq %rdi, %rsi ; HYBRID-NEXT: movl $127, %edi -; HYBRID-NEXT: bsrq %rsi, %rsi -; HYBRID-NEXT: cmoveq %rdi, %rsi -; HYBRID-NEXT: xorq $63, %rsi -; HYBRID-NEXT: addq $64, %rsi +; HYBRID-NEXT: bsrq %rsi, %rdi +; HYBRID-NEXT: xorq $63, %rdi +; HYBRID-NEXT: addq $64, %rdi ; HYBRID-NEXT: testq %rdx, %rdx -; HYBRID-NEXT: cmovneq %r10, %rsi -; HYBRID-NEXT: subq $-128, %rsi +; HYBRID-NEXT: cmovneq %r10, %rdi +; HYBRID-NEXT: subq $-128, %rdi ; HYBRID-NEXT: orq %r8, %rcx -; HYBRID-NEXT: cmovneq %r9, %rsi -; HYBRID-NEXT: movq %rsi, (%rax) +; HYBRID-NEXT: cmovneq %r9, %rdi +; HYBRID-NEXT: movq %rdi, (%rax) ; HYBRID-NEXT: movq $0, 8(%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq @@ -537,16 +528,15 @@ define i256 @test3(i256 %n) nounwind { ; BURR-NEXT: notq %rsi ; BURR-NEXT: andq %rdi, %rsi ; BURR-NEXT: movl $127, %edi -; BURR-NEXT: bsrq %rsi, %rsi -; BURR-NEXT: cmoveq %rdi, %rsi -; BURR-NEXT: xorq $63, %rsi -; BURR-NEXT: addq $64, %rsi +; BURR-NEXT: bsrq %rsi, %rdi +; BURR-NEXT: xorq $63, %rdi +; BURR-NEXT: addq $64, %rdi ; BURR-NEXT: testq %rdx, %rdx -; BURR-NEXT: cmovneq %r10, %rsi -; BURR-NEXT: subq $-128, %rsi +; BURR-NEXT: cmovneq %r10, %rdi +; BURR-NEXT: subq $-128, %rdi ; BURR-NEXT: orq %r8, %rcx -; BURR-NEXT: cmovneq %r9, %rsi -; BURR-NEXT: movq %rsi, (%rax) +; BURR-NEXT: cmovneq %r9, %rdi +; BURR-NEXT: movq %rdi, (%rax) ; BURR-NEXT: movq $0, 8(%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: retq @@ -579,9 +569,8 @@ 
define i256 @test3(i256 %n) nounwind { ; SRC-NEXT: cmovneq %rdi, %r9 ; SRC-NEXT: bsrq %rdx, %rdi ; SRC-NEXT: xorq $63, %rdi -; SRC-NEXT: bsrq %rsi, %rsi ; SRC-NEXT: movl $127, %r10d -; SRC-NEXT: cmovneq %rsi, %r10 +; SRC-NEXT: bsrq %rsi, %r10 ; SRC-NEXT: xorq $63, %r10 ; SRC-NEXT: addq $64, %r10 ; SRC-NEXT: testq %rdx, %rdx @@ -600,13 +589,12 @@ define i256 @test3(i256 %n) nounwind { ; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: xorps %xmm0, %xmm0 ; LIN-NEXT: movaps %xmm0, 16(%rdi) +; LIN-NEXT: movl $127, %r9d ; LIN-NEXT: movq %rsi, %rdi ; LIN-NEXT: negq %rdi ; LIN-NEXT: notq %rsi ; LIN-NEXT: andq %rdi, %rsi -; LIN-NEXT: bsrq %rsi, %rsi -; LIN-NEXT: movl $127, %r9d -; LIN-NEXT: cmovneq %rsi, %r9 +; LIN-NEXT: bsrq %rsi, %r9 ; LIN-NEXT: xorq $63, %r9 ; LIN-NEXT: addq $64, %r9 ; LIN-NEXT: xorl %edi, %edi diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 74017ea66529b9..0a8c5b4b76e9d5 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -684,12 +684,6 @@ static const X86FoldTableEntry Table1[] = { {X86::BLSR64rr, X86::BLSR64rm, 0}, {X86::BLSR64rr_EVEX, X86::BLSR64rm_EVEX, 0}, {X86::BLSR64rr_NF, X86::BLSR64rm_NF, 0}, - {X86::BSF16rr, X86::BSF16rm, 0}, - {X86::BSF32rr, X86::BSF32rm, 0}, - {X86::BSF64rr, X86::BSF64rm, 0}, - {X86::BSR16rr, X86::BSR16rm, 0}, - {X86::BSR32rr, X86::BSR32rm, 0}, - {X86::BSR64rr, X86::BSR64rm, 0}, {X86::BZHI32rr, X86::BZHI32rm, 0}, {X86::BZHI32rr_EVEX, X86::BZHI32rm_EVEX, 0}, {X86::BZHI32rr_NF, X86::BZHI32rm_NF, 0}, @@ -2072,6 +2066,12 @@ static const X86FoldTableEntry Table2[] = { {X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16}, {X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16}, {X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16}, + {X86::BSF16rr, X86::BSF16rm, 0}, + {X86::BSF32rr, X86::BSF32rm, 0}, + {X86::BSF64rr, X86::BSF64rm, 0}, + {X86::BSR16rr, X86::BSR16rm, 0}, + {X86::BSR32rr, X86::BSR32rm, 0}, + {X86::BSR64rr, X86::BSR64rm, 0}, {X86::CMOV16rr, 
X86::CMOV16rm, 0}, {X86::CMOV16rr_ND, X86::CMOV16rm_ND, 0}, {X86::CMOV32rr, X86::CMOV32rm, 0}, diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s index 6483809deda3a9..0bd5f451e2e341 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s @@ -15,12 +15,12 @@ bsf %rax, %rcx # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 655 +# CHECK-NEXT: Total Cycles: 663 # CHECK-NEXT: Total uOps: 1000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.53 -# CHECK-NEXT: IPC: 0.61 +# CHECK-NEXT: uOps Per Cycle: 1.51 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Instruction Info: From 0c66644270abc1455e92301a44232b9af75fafc6 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Thu, 23 Jan 2025 14:20:11 +0100 Subject: [PATCH 135/208] [bazel]Fix bazel build past 2e6cc79f816d942ab09d6a310cd925c1da148aa9 (#124112) Split target under LLVMIR/Transforms to avoid deps loop. 
--- .../llvm-project-overlay/mlir/BUILD.bazel | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 092c2de414e36e..d9f222982bc010 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5545,10 +5545,22 @@ gentbl_cc_library( cc_library( name = "LLVMIRTransforms", - srcs = glob([ - "lib/Dialect/LLVMIR/Transforms/*.cpp", - ]), - hdrs = glob(["include/mlir/Dialect/LLVMIR/Transforms/*.h"]), + srcs = glob( + [ + "lib/Dialect/LLVMIR/Transforms/*.cpp", + ], + exclude = ["lib/Dialect/LLVMIR/Transforms/LegalizeForExport.cpp"], + ), + hdrs = glob( + [ + "include/mlir/Dialect/LLVMIR/Transforms/*.h", + ], + exclude = [ + "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionLegalization.h", + "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionRewriter.h", + "include/mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h", + ], + ), includes = ["include"], deps = [ ":Analysis", @@ -5557,6 +5569,7 @@ cc_library( ":IR", ":InliningUtils", ":LLVMDialect", + ":LLVMIRTransformsLegalizeForExport", ":LLVMPassIncGen", ":NVVMDialect", ":Pass", @@ -5567,6 +5580,23 @@ cc_library( ], ) +cc_library( + name = "LLVMIRTransformsLegalizeForExport", + srcs = ["lib/Dialect/LLVMIR/Transforms/LegalizeForExport.cpp"], + hdrs = [ + "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionLegalization.h", + "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionRewriter.h", + "include/mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h", + ], + includes = ["include"], + deps = [ + ":IR", + ":LLVMPassIncGen", + ":LLVMDialect", + ":Pass", + ], +) + td_library( name = "GPUOpsTdFiles", srcs = [ @@ -6459,6 +6489,7 @@ cc_library( ":NVVMOpsIncGen", ":SideEffectInterfaces", ":Support", + ":ToLLVMIRTranslation", "//llvm:AsmParser", "//llvm:Core", "//llvm:Support", @@ -9212,8 +9243,8 @@ cc_library( ":IR", 
":LLVMConversionIncGen", ":LLVMDialect", - ":LLVMIRTransforms", ":LLVMIntrinsicConversionIncGen", + ":LLVMIRTransformsLegalizeForExport", ":OpenMPDialect", ":Support", ":TransformUtils", From 590e5e20b12f9fd956d0ba7de83aa2ab44c9faeb Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Thu, 23 Jan 2025 08:27:09 -0500 Subject: [PATCH 136/208] [M68k] Fix llc pass test after 3630d9ef65b30af7e4ca78e668649bbc48b5be66 --- llvm/test/CodeGen/M68k/pipeline.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll index bc224743e5b707..d61e591505e59d 100644 --- a/llvm/test/CodeGen/M68k/pipeline.ll +++ b/llvm/test/CodeGen/M68k/pipeline.ll @@ -31,6 +31,9 @@ ; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Partially inline calls to library functions ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics From d3d605b7cdee132929d32f8b71b01641eb1d6d37 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 23 Jan 2025 13:28:59 +0000 Subject: [PATCH 137/208] [FileCheck] Use move semantics instead of std::swap. NFC. (#123304) This code was using a pre-move-semantics trick of using std::swap to avoid expensive vector copies. --- llvm/lib/FileCheck/FileCheck.cpp | 8 ++++---- llvm/lib/FileCheck/FileCheckImpl.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index a6df9672f81008..5706afc357fbd3 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1933,8 +1933,8 @@ bool FileCheck::readCheckFile( } // Okay, add the string we captured to the output vector and move on. 
- CheckStrings.emplace_back(P, UsedPrefix, PatternLoc); - std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); + CheckStrings.emplace_back(std::move(P), UsedPrefix, PatternLoc, + std::move(DagNotMatches)); DagNotMatches = ImplicitNegativeChecks; } @@ -1963,8 +1963,8 @@ bool FileCheck::readCheckFile( if (!DagNotMatches.empty()) { CheckStrings.emplace_back( Pattern(Check::CheckEOF, PatternContext.get(), LineNumber + 1), - *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data())); - std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); + *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data()), + std::move(DagNotMatches)); } return false; diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index c772eddd8ecd5e..4715fa9c64b619 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -837,8 +837,9 @@ struct FileCheckString { /// Hold the DAG/NOT strings occurring in the input file. std::vector DagNotStrings; - FileCheckString(const Pattern &P, StringRef S, SMLoc L) - : Pat(P), Prefix(S), Loc(L) {} + FileCheckString(Pattern &&P, StringRef S, SMLoc L, + std::vector &&D) + : Pat(std::move(P)), Prefix(S), Loc(L), DagNotStrings(std::move(D)) {} /// Matches check string and its "not strings" and/or "dag strings". 
size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode, From fb3fa41aee4733e549620a4aa444525aacb075f7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2025 18:38:00 +0700 Subject: [PATCH 138/208] MachineRegisterInfo: Use variable for TRI --- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index f058445cc556dc..937f63f6c5e004 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -122,8 +122,8 @@ bool MachineRegisterInfo::recomputeRegClass(Register Reg) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); const TargetRegisterClass *OldRC = getRegClass(Reg); - const TargetRegisterClass *NewRC = - getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC, *MF); + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC, *MF); // Stop early if there is no room to grow. if (NewRC == OldRC) @@ -134,8 +134,7 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) { // Apply the effect of the given operand to NewRC. MachineInstr *MI = MO.getParent(); unsigned OpNo = &MO - &MI->getOperand(0); - NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, - getTargetRegisterInfo()); + NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, TRI); if (!NewRC || NewRC == OldRC) return false; } From 6fdaaafd89d7cbc15dafe3ebf1aa3235d148aaab Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Thu, 23 Jan 2025 14:32:01 +0100 Subject: [PATCH 139/208] [AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#123942) This is meant as a short-term workaround for an invalid conversion in this pass that occurs because existing SDWA selections are not correctly taken into account during the conversion. See the draft PR #123221 for an attempt to fix the actual issue. 
--------- Co-authored-by: Frederik Harwath --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 7 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 15 ++-- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 15 ++-- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 26 +++--- .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 26 +++--- .../buffer-fat-pointer-atomicrmw-fadd.ll | 28 ++++-- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 48 +++++++--- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 32 +++++-- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 56 +++++++++--- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 32 +++++-- llvm/test/CodeGen/AMDGPU/idot4u.ll | 22 ++--- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 16 +++- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 16 +++- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 3 +- .../AMDGPU/sdwa-peephole-instr-combine-sel.ll | 87 +++++++++++++++++++ .../sdwa-peephole-instr-combine-sel.mir | 56 ++++++++++++ .../AMDGPU/sdwa-peephole-instr-gfx10.mir | 3 +- .../CodeGen/AMDGPU/sdwa-peephole-instr.mir | 7 +- llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 15 ++-- 19 files changed, 400 insertions(+), 110 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 467f042892cebe..bdd164a2f01312 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -962,8 +962,11 @@ bool isConvertibleToSDWA(MachineInstr &MI, const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); - if (TII->isSDWA(Opc)) - return true; + if (TII->isSDWA(Opc)) { + // FIXME: Reenable after fixing selection handling. + // Cf. 
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll + return false; + } // Check if this instruction has opcode that supports SDWA if (AMDGPU::getSDWAOp(Opc) == -1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index e289ee759da158..2d9e8969fdbb52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -280,8 +280,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -299,7 +300,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -439,7 +441,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -609,9 +612,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 43ebe156eb2a28..a98b305c15f75c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -281,8 +281,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, 
sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -300,7 +301,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -440,7 +442,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -610,9 +613,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 788692c94b0cfa..3d7fec9a5986cd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -224,7 +224,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -329,7 +330,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -451,9 +453,11 @@ define i32 
@v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -618,18 +622,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; 
GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0042d34e235d17..0ab16d95b191d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -218,7 +218,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -321,7 +322,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -439,9 +441,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 
%rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -602,18 +606,20 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 
dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index e8f1619c5d418c..a969e3d4f4f79b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -6398,8 +6398,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -6625,8 +6627,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ 
-7044,7 +7048,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 @@ -7390,8 +7396,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -7650,8 +7658,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -7915,8 +7925,10 @@ define <2 x 
half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -8175,8 +8187,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ff48a3fc980187..72f883928cffbc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -14349,8 +14349,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14539,8 +14541,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14743,8 +14747,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14924,8 +14930,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15107,8 +15115,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15308,8 +15318,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15502,8 +15514,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15690,8 +15704,10 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15878,8 +15894,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16059,8 
+16077,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16244,8 +16264,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16425,8 +16447,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; 
GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 14f75814128f18..9c2a76380d83dc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -12094,8 +12094,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12316,8 +12318,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12556,8 +12560,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12766,8 +12772,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12978,8 +12986,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13211,8 +13221,10 @@ define void 
@flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13437,8 +13449,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13654,8 +13668,10 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, 
v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index ec4ea232e661cf..2be6bf302d35f7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -15403,8 +15403,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15635,8 +15637,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15867,8 +15871,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa 
v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16083,8 +16089,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16293,8 +16301,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16504,8 +16514,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16744,8 +16756,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16961,8 +16975,10 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17202,8 +17218,10 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17440,8 +17458,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17666,8 +17686,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17878,8 +17900,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18118,8 +18142,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18356,8 +18382,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: 
v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 3dbf6477a7cb89..24791b60bfc6d8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -12433,8 +12433,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12711,8 +12713,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12989,8 +12993,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: 
v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13260,8 +13266,10 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13525,8 +13533,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13791,8 +13801,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14077,8 +14089,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14349,8 +14363,10 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll 
index 8f82348d350e0a..10fac09ef4ec07 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2518,16 +2518,17 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 -; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 +; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2546,16 +2547,17 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 23b57a7efa586c..e4602f20f8a37c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5034,8 +5034,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5257,8 +5259,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5474,8 +5478,10 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5688,8 +5694,10 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 
1b08b64b046b48..967e972e53e290 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -5532,8 +5532,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5787,8 +5789,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6033,8 +6037,10 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6276,8 +6282,10 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 37bf8516403bf5..4e8248d4be14ec 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -592,7 +592,8 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll new file mode 100644 index 00000000000000..6eae905278f3ed --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - < %s | FileCheck -check-prefix=CHECK %s + +; The si-peephole-sdwa pass has mishandled the selections of preexisting sdwa instructions +; which led to an instruction of this shape: +; v_lshlrev_b32_sdwa v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; instead of +; v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) { +; CHECK-LABEL: widget: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: v_mov_b32_e32 v2, 8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_ushort v1, v0, s[0:1] +; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2 +; CHECK-NEXT: s_bitcmp1_b32 s2, 0 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; CHECK-NEXT: s_cbranch_vccz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb19 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ds_write_b32 v1, v1 +; 
CHECK-NEXT: .LBB0_2: ; %bb20 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0 +; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 +; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; %bb11 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ds_write_b32 v0, v1 offset:84 +; CHECK-NEXT: .LBB0_4: ; %bb14 +; CHECK-NEXT: s_endpgm +bb: + %call = tail call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %call to i64 + %getelementptr = getelementptr i8, ptr addrspace(1) %arg, i64 %zext + %load = load i8, ptr addrspace(1) %getelementptr, align 1 + %or = or disjoint i32 %call, 1 + %zext4 = zext i32 %or to i64 + %getelementptr5 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext4 + %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1 + %or7 = or disjoint i32 %call, 2 + %zext8 = zext i32 %or7 to i64 + %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext8 + %load10 = load i8, ptr addrspace(1) %getelementptr9, align 1 + br i1 %arg1, label %bb19, label %bb20 + +bb11: ; preds = %bb20 + %zext12 = zext i8 %load10 to i64 + %getelementptr13 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i64 0, i64 %zext12 + store i32 0, ptr addrspace(3) %getelementptr13, align 4 + br label %bb14 + +bb14: ; preds = %bb20, %bb11 + %zext15 = zext i8 %load6 to i64 + %getelementptr16 = getelementptr [14 x i32], ptr addrspace(3) %arg2, i64 0, i64 %zext15 + %zext17 = zext i8 %load to i64 + %getelementptr18 = getelementptr [14 x i32], ptr addrspace(3) %arg3, i64 0, i64 %zext17 + ret void + +bb19: ; preds = %bb + store i32 0, ptr addrspace(3) null, align 4 + br label %bb20 + +bb20: ; preds = %bb19, %bb + %icmp = icmp eq i8 %load10, 0 + br i1 %icmp, label %bb14, label %bb11 +} + +; Function Attrs: nocallback nofree nosync 
nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir new file mode 100644 index 00000000000000..cc2c8b3940d78b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CHECK %s + +# Currently the conversions in si-peephole-sdwa are disabled on preexisting sdwa instructions. +# If they are reenabled, the code matches this pattern instead of the corresponding pattern +# for V_LSHLREV_B32_sdwa further below: +# [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, %{{[0-9]+}}, 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 5, implicit $exec + +# TODO Implement a fix for the incorrect sdwa selection + +--- +name: sdwa_opsel_hazard +body: | + ; CHECK-LABEL: name: sdwa_opsel_hazard + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 undef %5, 255, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef %5, 0, 6, 0, 6, 
0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + bb.0: + successors: %bb.2(0x40000000) + %0:sreg_32 = IMPLICIT_DEF + %1:sreg_64_xexec_xnull = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec + %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec + S_ENDPGM 0 + + bb.2: + successors: %bb.1(0x40000000) + + %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec + + S_BRANCH %bb.1 + +... + diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir index 62538120f84519..aaa32d871148bf 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir @@ -138,7 +138,8 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions -# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index 
e2854df2468b39..c027600a8af674 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -147,14 +147,15 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions - -# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec +# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec +# VI: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec -# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec +# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec +# GFX9: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir index ffbd2d092b5d81..467bc77c185779 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -37,9 +37,10 @@ body: | ; SDWA-NEXT: 
[[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 %2 = COPY $sgpr30_sgpr31 @@ -145,7 +146,7 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[FLAT_LOAD_DWORD]], implicit $exec ; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535 ; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_]], implicit $exec - ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[FLAT_LOAD_DWORD]](tied-def 0) + ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[V_AND_B32_e64_]](tied-def 0) ; SDWA-NEXT: 
FLAT_STORE_DWORD [[COPY2]], [[V_MOV_B32_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: S_ENDPGM 0 %2 = COPY $sgpr30_sgpr31 @@ -180,15 +181,17 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.1: ; SDWA-NEXT: successors: %bb.2(0x80000000) ; SDWA-NEXT: {{ $}} - ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec + ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 6, 0, 1, 3, implicit $mode, implicit $exec + ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[V_MUL_F32_sdwa]], implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.2: - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_LSHLREV_B32_e64_]], implicit $exec + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 bb.0: From d8eb4ac41d881a19bea7673d753ba92e6a11f5d6 Mon Sep 17 00:00:00 
2001 From: Nikita Popov Date: Thu, 23 Jan 2025 14:35:32 +0100 Subject: [PATCH 140/208] [Support] Remove ciso646 include (#123578) This header has been removed in C++20 and causes a large amount of deprecation spam when building against libstdc++ 15 in C++17 mode. As far as I understand, we just need to include *some* STL header to get access to the version macros, and as this header also includes nowadays we can just drop the include entirely. --- llvm/include/llvm/Support/Threading.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index d8e2cb0514ddd7..01e26ad9b858ea 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -18,7 +18,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" -#include // So we can check the C++ standard lib macros. #include #if defined(_MSC_VER) From ff55c9bc63ddd1bbe13376c25ae1fc327e3d5da2 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Thu, 23 Jan 2025 14:53:11 +0100 Subject: [PATCH 141/208] [llvm][amdgpu] Handle indirect refs to LDS GVs during LDS lowering (#124089) Fixes #123800 Extends LDS lowering by allowing it to discover transitive indirect/escaping references to LDS GVs.
For example, given the following input: ```llvm @lds_item_to_indirectly_load = internal addrspace(3) global ptr undef, align 8 %store_type = type { i32, ptr } @place_to_store_indirect_caller = internal addrspace(3) global %store_type undef, align 8 define amdgpu_kernel void @offloading_kernel() { store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8 call void @call_unknown() ret void } define void @call_unknown() { %1 = alloca ptr, align 8 %2 = call i32 %1() ret void } define void @indirectly_load_lds() { call void @directly_load_lds() ret void } define void @directly_load_lds() { %2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8 ret void } ``` With the above input, prior to this patch, LDS lowering failed to lower the reference to `@lds_item_to_indirectly_load` because: 1. it is indirectly called by a function whose address is taken in the kernel. 2. we did not check if the kernel indirectly makes any calls to unknown functions (we only checked the direct calls). 
Co-authored-by: Jon Chesterfield --- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 52 ++++++++++++++++--- .../AMDGPU/lower-indirect-lds-references.ll | 44 ++++++++++++++++ .../AMDGPU/remove-no-kernel-id-attribute.ll | 2 +- 3 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index 0406ba9c68ccd3..a5bfdb7bf6eacd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -141,8 +141,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap DirectMapFunction; getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction); - // Collect variables that are used by functions whose address has escaped - DenseSet VariablesReachableThroughFunctionPointer; + // Collect functions whose address has escaped + DenseSet AddressTakenFuncs; for (Function &F : M.functions()) { if (!isKernelLDS(&F)) if (F.hasAddressTaken(nullptr, @@ -150,11 +150,16 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { /* IgnoreAssumeLikeCalls */ false, /* IgnoreLLVMUsed */ true, /* IgnoreArcAttachedCall */ false)) { - set_union(VariablesReachableThroughFunctionPointer, - DirectMapFunction[&F]); + AddressTakenFuncs.insert(&F); } } + // Collect variables that are used by functions whose address has escaped + DenseSet VariablesReachableThroughFunctionPointer; + for (Function *F : AddressTakenFuncs) { + set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]); + } + auto FunctionMakesUnknownCall = [&](const Function *F) -> bool { assert(!F->isDeclaration()); for (const CallGraphNode::CallRecord &R : *CG[F]) { @@ -206,6 +211,13 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { } } + // Collect variables that are transitively used by functions whose address has + // escaped + 
for (Function *F : AddressTakenFuncs) { + set_union(VariablesReachableThroughFunctionPointer, + TransitiveMapFunction[F]); + } + // DirectMapKernel lists which variables are used by the kernel // find the variables which are used through a function call FunctionVariableMap IndirectMapKernel; @@ -218,11 +230,37 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { Function *Ith = R.second->getFunction(); if (Ith) { set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]); - } else { - set_union(IndirectMapKernel[&Func], - VariablesReachableThroughFunctionPointer); } } + + // Check if the kernel encounters unknown calls, whether directly or + indirectly. + bool SeesUnknownCalls = [&]() { + SmallVector WorkList = {CG[&Func]->getFunction()}; + SmallPtrSet Visited; + + while (!WorkList.empty()) { + Function *F = WorkList.pop_back_val(); + + for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) { + if (!CallRecord.second) + continue; + + Function *Callee = CallRecord.second->getFunction(); + if (!Callee) + return true; + + if (Visited.insert(Callee).second) + WorkList.push_back(Callee); + } + } + return false; + }(); + + if (SeesUnknownCalls) { + set_union(IndirectMapKernel[&Func], + VariablesReachableThroughFunctionPointer); + } } // Verify that we fall into one of 2 cases: diff --git a/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll new file mode 100644 index 00000000000000..1b0c8d66d3ebc7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll @@ -0,0 +1,44 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; Tests that the LDS lowering pass handles indirect references to LDS GVs; i.e. +; that it lowers to accesses into the generated LDS struct if these references +; are deep in the call graph starting at the kernel.
+ +@lds_item_to_indirectly_load = internal addrspace(3) global ptr poison, align 8 + +%store_type = type { i32, ptr } +@place_to_store_indirect_caller = internal addrspace(3) global %store_type poison, align 8 + +define amdgpu_kernel void @offloading_kernel() { + store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8 + call void @call_unknown() + ret void +} + +define void @call_unknown() { + %1 = alloca ptr, align 8 + %2 = call i32 %1() + ret void +} + +define void @indirectly_load_lds() { + call void @directly_load_lds() + ret void +} + +define void @directly_load_lds() { + %2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8 + ret void +} + +; CHECK: %[[LDS_STRUCT_TY:.*]] = type { %store_type, ptr } +; CHECK: @[[LDS_STRUCT:.*]] = {{.*}} %[[LDS_STRUCT_TY]] {{.*}} !absolute_symbol + +; CHECK: define amdgpu_kernel void @offloading_kernel() {{.*}} { +; CHECK: store ptr @indirectly_load_lds, {{.*}} @[[LDS_STRUCT]] +; CHECK: call void @call_unknown() +; CHECK: } + +; CHECK: define void @directly_load_lds() { +; CHECK: load ptr, {{.*}} (%[[LDS_STRUCT_TY]], {{.*}} @[[LDS_STRUCT]], i32 0, i32 1) +; CHECK: } diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 2850612d700817..1765bd1cfb0086 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -196,7 +196,7 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes 
#[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. From 92b839e9c82450a3c465d349de73818e6aad59f3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 22 Jan 2025 10:19:30 -0500 Subject: [PATCH 142/208] [gn] fix mistake in ee99c4d4845db --- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 008715a0b3dea5..5146d4141f29b6 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -33,7 +33,7 @@ static_library("Support") { "Windows", ] sources = [ - "AArch64BuildAttributes.cpp" + "AArch64BuildAttributes.cpp", "ABIBreak.cpp", "AMDGPUMetadata.cpp", "APFixedPoint.cpp", @@ -42,7 +42,6 @@ static_library("Support") { "APSInt.cpp", "ARMAttributeParser.cpp", "ARMBuildAttrs.cpp", - "AArch64BuildAttributes.cpp", "ARMWinEH.cpp", "Allocator.cpp", "AutoConvert.cpp", From e28e93550a74752714db6fffe50233aa96e536a5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 20:58:02 +0700 Subject: [PATCH 143/208] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (#123684) For VALU shuffles, this saves an instruction in some case. 
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 119 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 + .../AMDGPU/shufflevector.v2f32.v2f32.ll | 49 ++-- .../AMDGPU/shufflevector.v2f32.v3f32.ll | 40 ++- .../AMDGPU/shufflevector.v2f32.v4f32.ll | 84 +++--- .../AMDGPU/shufflevector.v2f32.v8f32.ll | 272 ++++++++---------- .../AMDGPU/shufflevector.v2i32.v2i32.ll | 49 ++-- .../AMDGPU/shufflevector.v2i32.v3i32.ll | 40 ++- .../AMDGPU/shufflevector.v2i32.v4i32.ll | 84 +++--- .../AMDGPU/shufflevector.v2i32.v8i32.ll | 272 ++++++++---------- .../CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll | 49 ++-- .../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 40 ++- .../CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll | 84 +++--- .../CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll | 272 ++++++++---------- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 96 +++---- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 3 +- 17 files changed, 728 insertions(+), 833 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6d5c3b5e0742b3..8e90754103ff16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -489,6 +489,95 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); } +void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) { + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + + // TODO: Handle 16-bit element vectors with even aligned masks. 
+ if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) || + VT.getVectorNumElements() != 2) { + SelectCode(N); + return; + } + + auto *SVN = cast(N); + + SDValue Src0 = SVN->getOperand(0); + SDValue Src1 = SVN->getOperand(1); + ArrayRef Mask = SVN->getMask(); + SDLoc DL(N); + + assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 && + Mask[0] < 4 && Mask[1] < 4); + + SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1; + SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1; + unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0; + unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0; + + if (Mask[0] < 0) { + Src0SubReg = Src1SubReg; + MachineSDNode *ImpDef = + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + VSrc0 = SDValue(ImpDef, 0); + } + + if (Mask[1] < 0) { + Src1SubReg = Src0SubReg; + MachineSDNode *ImpDef = + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + VSrc1 = SDValue(ImpDef, 0); + } + + // SGPR case needs to lower to copies. + // + // Also use subregister extract when we can directly blend the registers with + // a simple subregister copy. + // + // TODO: Maybe we should fold this out earlier + if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 && + Src1SubReg == AMDGPU::sub0) { + // The low element of the result always comes from src0. + // The high element of the result always comes from src1. + // op_sel selects the high half of src0. + // op_sel_hi selects the high half of src1. + + unsigned Src0OpSel = + Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + unsigned Src1OpSel = + Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + + // Enable op_sel_hi to avoid printing it. This should have no effect on the + // result. 
+ Src0OpSel |= SISrcMods::OP_SEL_1; + Src1OpSel |= SISrcMods::OP_SEL_1; + + SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32); + SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32); + SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32); + + CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(), + {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1, + ZeroMods, // clamp + ZeroMods, // op_sel + ZeroMods, // op_sel_hi + ZeroMods, // neg_lo + ZeroMods}); // neg_hi + return; + } + + SDValue ResultElt0 = + CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0); + SDValue ResultElt1 = + CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1); + + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; + CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops); +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -562,6 +651,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectBuildVector(N, RegClassID); return; } + case ISD::VECTOR_SHUFFLE: + SelectVectorShuffle(N); + return; case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; SDLoc DL(N); @@ -3101,6 +3193,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, } Mods = VecMods; + } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE && + Src.getNumOperands() == 2) { + + // TODO: We should repeat the build_vector source check above for the + // vector_shuffle for negates and casts of individual elements. + + auto *SVN = cast(Src); + ArrayRef Mask = SVN->getMask(); + + if (Mask[0] < 2 && Mask[1] < 2) { + // src1 should be undef. 
+ SDValue ShuffleSrc = SVN->getOperand(0); + + if (ShuffleSrc.getOpcode() == ISD::FNEG) { + ShuffleSrc = ShuffleSrc.getOperand(0); + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + } + + if (Mask[0] == 1) + Mods |= SISrcMods::OP_SEL_0; + if (Mask[1] == 1) + Mods |= SISrcMods::OP_SEL_1; + + Src = ShuffleSrc; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } } // Packed instructions do not have abs modifiers. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 7e61eb470622f1..7dcd208a9cdd41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { protected: void SelectBuildVector(SDNode *N, unsigned RegClassID); + void SelectVectorShuffle(SDNode *N); private: std::pair foldFrameIndex(SDValue N) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6cf5774fc53b06..1aeca7f370aa1b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, Expand); + if (Subtarget->hasPkMovB32()) { + // TODO: 16-bit element vectors should be legal with even aligned elements. + // TODO: Can be legal with wider source types than the result with + // subregister extracts. 
+ setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal); + } + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, Custom); diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 3410b067fb5b4e..2f6ddc63cb3e47 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call 
<2 x float> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 7edb6939f884c1..3d42e66eb865c8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll index ea02b31bff04fd..a312b40a99a813 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; 
GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ 
define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] 
sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll index 0fc63853f63ab8..2568390d8d7a61 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { 
; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: 
global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] 
op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void 
@v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6425,8 +6396,8 @@ define void @v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg 
%ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, 
v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 632e8d2a32bad8..2d27d7199ddf42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2i32_v2i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2i32_v2i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index fb6671ca787012..ea08df2e4f50ff 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: 
; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll index b4051228a443e8..a2431d56ce2fd6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND 
; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: 
v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 
v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll index 11d1b88a938f2e..83a51bc87eccf1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; 
def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; 
def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], 
s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
@@ -6425,8 +6396,8 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) { ; 
GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND 
-; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; 
GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void 
@v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr 
addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 2cb50e0493ae0a..6d5005a8998325 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def 
v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2p3_v2p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2p3_v2p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index b92fa40a269996..2c8f2952fd1065 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; 
GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll index 8080c22d792198..20abdd10f949ee 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def 
v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void 
@v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 
v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll index 02a5800ce1896a..df7bdbf04d4e31 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 
@@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: 
;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6425,8 +6396,8 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; 
GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 
; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void 
@v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { 
; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index e7ae9d831424cc..b85bd4c6346684 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4942,78 +4942,78 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; ; GFX940-LABEL: fma_shuffle_v2bf16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX940-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0x7060302 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] -; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX940-NEXT: 
global_load_dwordx2 v[4:5], v6, s[10:11] ; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX940-NEXT: v_fmac_f32_e32 v0, v8, v4 -; GFX940-NEXT: v_fmac_f32_e32 v1, v12, v4 -; GFX940-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX940-NEXT: v_fmac_f32_e32 v11, v12, v9 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX940-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX940-NEXT: v_add3_u32 v4, v4, v7, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX940-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX940-NEXT: v_add3_u32 v9, v9, v0, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX940-NEXT: v_bfe_u32 v15, v1, 16, 1 -; GFX940-NEXT: v_add3_u32 v13, v13, v11, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_fmac_f32_e32 v8, v7, v9 +; GFX940-NEXT: v_fmac_f32_e32 v2, v7, v4 +; GFX940-NEXT: v_fmac_f32_e32 v3, v11, v4 +; GFX940-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX940-NEXT: v_fmac_f32_e32 v12, v11, v9 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX940-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX940-NEXT: 
v_add3_u32 v4, v4, v8, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX940-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX940-NEXT: v_add3_u32 v9, v9, v2, s2 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX940-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX940-NEXT: v_add3_u32 v13, v13, v12, s2 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v15, v15, v1, s2 +; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v15, v15, v3, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc ; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX940-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX940-NEXT: v_fmac_f32_e32 v2, v0, v10 ; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX940-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_fmac_f32_e32 v1, v3, v10 -; GFX940-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX940-NEXT: v_fmac_f32_e32 v4, v0, v5 +; GFX940-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX940-NEXT: v_fmac_f32_e32 v3, v1, v10 +; GFX940-NEXT: v_fmac_f32_e32 v7, v1, v5 +; GFX940-NEXT: v_or_b32_e32 v1, 0x400000, v2 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: 
v_add3_u32 v2, v2, v0, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_add3_u32 v0, v0, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX940-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX940-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX940-NEXT: v_add3_u32 v9, v9, v1, s2 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX940-NEXT: v_add3_u32 v11, v11, v7, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 6633cec659d8e5..39af91b81110d0 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -70,8 +70,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s9 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) From 0c71fdd1575b826cbb3c252ee0b15fc84559abec Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Jan 2025 08:14:52 -0600 Subject: [PATCH 144/208] [NVPTX] Fix ctor / dtor lowering when NVPTX target is 
not enabled (#124116) Summary: We pass the `-nvptx-lower-global-ctor-dtor` option to support the `libc` like use-case which needs global constructors sometimes. This only affects the backend. If the NVPTX target is not enabled this option will be unknown which prevents you from compiling generic IR for this. --- clang/lib/Driver/ToolChains/Cuda.cpp | 5 ++++- clang/test/Driver/cuda-cross-compiling.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index d4099216c81ba8..0922a97ed7c19d 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -639,6 +639,9 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back( Args.MakeArgString("--plugin-opt=-mattr=" + llvm::join(Features, ","))); + // Enable ctor / dtor lowering for the direct / freestanding NVPTX target. + CmdArgs.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"}); + // Add paths for the default clang library path. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); @@ -783,7 +786,7 @@ void NVPTXToolChain::addClangTargetOptions( // If we are compiling with a standalone NVPTX toolchain we want to try to // mimic a standard environment as much as possible. So we enable lowering // ctor / dtor functions to global symbols that can be registered. 
- if (Freestanding) + if (Freestanding && !getDriver().isUsingLTO()) CC1Args.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"}); } diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index baf37048300315..7817e462c47be9 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -63,8 +63,12 @@ // // RUN: %clang -target nvptx64-nvidia-cuda -march=sm_70 %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=LOWERING %s +// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_70 -flto -c %s -### 2>&1 \ +// RUN: | FileCheck -check-prefix=LOWERING-LTO %s // LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor" +// LOWERING: clang-nvlink-wrapper{{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor" +// LOWERING-LTO-NOT: "--nvptx-lower-global-ctor-dtor" // // Test passing arguments directly to nvlink. From 99d450e9f51683bad608bf801e1b29e5c54b8917 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 23 Jan 2025 09:19:42 -0500 Subject: [PATCH 145/208] Revert "[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#123942)" This reverts commit 6fdaaafd89d7cbc15dafe3ebf1aa3235d148aaab. 
Breaks check-llvm, see https://github.com/llvm/llvm-project/pull/123942#issuecomment-2609861953 --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 7 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 15 ++-- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 15 ++-- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 26 +++--- .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 26 +++--- .../buffer-fat-pointer-atomicrmw-fadd.ll | 28 ++---- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 48 +++------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 32 ++----- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 56 +++--------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 32 ++----- llvm/test/CodeGen/AMDGPU/idot4u.ll | 22 +++-- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 16 +--- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 16 +--- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 3 +- .../AMDGPU/sdwa-peephole-instr-combine-sel.ll | 87 ------------------- .../sdwa-peephole-instr-combine-sel.mir | 56 ------------ .../AMDGPU/sdwa-peephole-instr-gfx10.mir | 3 +- .../CodeGen/AMDGPU/sdwa-peephole-instr.mir | 7 +- llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 15 ++-- 19 files changed, 110 insertions(+), 400 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bdd164a2f01312..467f042892cebe 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -962,11 +962,8 @@ bool isConvertibleToSDWA(MachineInstr &MI, const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); - if (TII->isSDWA(Opc)) { - // FIXME: Reenable after fixing selection handling. - // Cf. 
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll - return false; - } + if (TII->isSDWA(Opc)) + return true; // Check if this instruction has opcode that supports SDWA if (AMDGPU::getSDWAOp(Opc) == -1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 2d9e8969fdbb52..e289ee759da158 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -280,9 +280,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -300,8 +299,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -441,8 +439,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -612,11 +609,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index a98b305c15f75c..43ebe156eb2a28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -281,9 +281,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; 
GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -301,8 +300,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -442,8 +440,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -613,11 +610,9 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 3d7fec9a5986cd..788692c94b0cfa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -224,8 +224,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -330,8 +329,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ 
-453,11 +451,9 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -622,20 +618,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0ab16d95b191d9..0042d34e235d17 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -218,8 +218,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -322,8 +321,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return 
to shader part epilog @@ -441,11 +439,9 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -606,20 +602,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa 
v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index a969e3d4f4f79b..e8f1619c5d418c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -6398,10 +6398,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -6627,10 +6625,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, 
v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -7048,9 +7044,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 @@ -7396,10 +7390,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -7658,10 +7650,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -7925,10 +7915,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -8187,10 +8175,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 72f883928cffbc..ff48a3fc980187 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -14349,10 +14349,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 
v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14541,10 +14539,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14747,10 +14743,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14930,10 +14924,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: .LBB59_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15115,10 +15107,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15318,10 +15308,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -15514,10 +15502,8 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15704,10 +15690,8 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15894,10 +15878,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16077,10 +16059,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16264,10 +16244,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16447,10 +16425,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: 
v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9c2a76380d83dc..14f75814128f18 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -12094,10 +12094,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12318,10 +12316,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-12560,10 +12556,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12772,10 +12766,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12986,10 +12978,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13221,10 +13211,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13449,10 +13437,8 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13668,10 +13654,8 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 2be6bf302d35f7..ec4ea232e661cf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -15403,10 +15403,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15637,10 +15635,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15871,10 +15867,8 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16089,10 +16083,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16301,10 +16293,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 
; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16514,10 +16504,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16756,10 +16744,8 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16975,10 +16961,8 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: 
v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17218,10 +17202,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17458,10 +17440,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17686,10 +17666,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 
v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17900,10 +17878,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18142,10 +18118,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18382,10 +18356,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 24791b60bfc6d8..3dbf6477a7cb89 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -12433,10 +12433,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12713,10 +12711,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12993,10 +12989,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr 
addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13266,10 +13260,8 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13533,10 +13525,8 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13801,10 +13791,8 
@@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14089,10 +14077,8 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14363,10 +14349,8 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap 
v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 10fac09ef4ec07..8f82348d350e0a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2518,17 +2518,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9 +; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2547,17 +2546,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-DL-NEXT: 
v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index e4602f20f8a37c..23b57a7efa586c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5034,10 +5034,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5259,10 +5257,8 @@ define <2 x half> 
@local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5478,10 +5474,8 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5694,10 +5688,8 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 967e972e53e290..1b08b64b046b48 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -5532,10 +5532,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,10 +5787,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6037,10 +6033,8 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6282,10 +6276,8 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 4e8248d4be14ec..37bf8516403bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -592,8 +592,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: global_store_dword 
v[5:6], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll deleted file mode 100644 index 6eae905278f3ed..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - < %s | FileCheck -check-prefix=CHECK %s - -; The si-peephole-sdwa pass has mishandled the selections of preexisting sdwa instructions -; which led to an instruction of this shape: -; v_lshlrev_b32_sdwa v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; instead of -; v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 - -define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) { -; CHECK-LABEL: widget: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 -; CHECK-NEXT: v_mov_b32_e32 v2, 8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_ushort v1, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2 -; CHECK-NEXT: s_bitcmp1_b32 s2, 0 -; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; CHECK-NEXT: s_cbranch_vccz .LBB0_2 -; CHECK-NEXT: ; 
%bb.1: ; %bb19 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: ds_write_b32 v1, v1 -; CHECK-NEXT: .LBB0_2: ; %bb20 -; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CHECK-NEXT: s_mov_b32 s0, exec_lo -; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0 -; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: ; %bb11 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: ds_write_b32 v0, v1 offset:84 -; CHECK-NEXT: .LBB0_4: ; %bb14 -; CHECK-NEXT: s_endpgm -bb: - %call = tail call i32 @llvm.amdgcn.workitem.id.x() - %zext = zext i32 %call to i64 - %getelementptr = getelementptr i8, ptr addrspace(1) %arg, i64 %zext - %load = load i8, ptr addrspace(1) %getelementptr, align 1 - %or = or disjoint i32 %call, 1 - %zext4 = zext i32 %or to i64 - %getelementptr5 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext4 - %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1 - %or7 = or disjoint i32 %call, 2 - %zext8 = zext i32 %or7 to i64 - %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext8 - %load10 = load i8, ptr addrspace(1) %getelementptr9, align 1 - br i1 %arg1, label %bb19, label %bb20 - -bb11: ; preds = %bb20 - %zext12 = zext i8 %load10 to i64 - %getelementptr13 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i64 0, i64 %zext12 - store i32 0, ptr addrspace(3) %getelementptr13, align 4 - br label %bb14 - -bb14: ; preds = %bb20, %bb11 - %zext15 = zext i8 %load6 to i64 - %getelementptr16 = getelementptr [14 x i32], ptr addrspace(3) %arg2, i64 0, i64 %zext15 - %zext17 = zext i8 %load to i64 - %getelementptr18 = getelementptr [14 x i32], ptr addrspace(3) %arg3, i64 0, i64 %zext17 - ret void - -bb19: ; preds = %bb - store i32 0, ptr addrspace(3) null, align 4 - br label %bb20 - -bb20: ; preds = %bb19, %bb - %icmp = icmp eq i8 %load10, 0 - br i1 
%icmp, label %bb14, label %bb11 -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare noundef i32 @llvm.amdgcn.workitem.id.x() #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir deleted file mode 100644 index cc2c8b3940d78b..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir +++ /dev/null @@ -1,56 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CHECK %s - -# Currently the conversions in si-peephole-sdwa are disabled on preexisting sdwa instructions. -# If they are reenabled, the code matches this pattern instead of the corresponding pattern -# for V_LSHLREV_B32_sdwa further below: -# [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, %{{[0-9]+}}, 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 5, implicit $exec - -# TODO Implement a fix for the incorrect sdwa selection - ---- -name: sdwa_opsel_hazard -body: | - ; CHECK-LABEL: name: sdwa_opsel_hazard - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 undef %5, 255, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; CHECK-NEXT: 
[[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef %5, 0, 6, 0, 6, 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 - bb.0: - successors: %bb.2(0x40000000) - %0:sreg_32 = IMPLICIT_DEF - %1:sreg_64_xexec_xnull = IMPLICIT_DEF - %2:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec - S_BRANCH %bb.2 - - bb.1: - %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec - %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec - S_ENDPGM 0 - - bb.2: - successors: %bb.1(0x40000000) - - %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec - - S_BRANCH %bb.1 - -... - diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir index aaa32d871148bf..62538120f84519 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir @@ -138,8 +138,7 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions -# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# GFX1010: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index c027600a8af674..e2854df2468b39 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -147,15 +147,14 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions -# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# VI: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec + +# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec -# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# GFX9: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec +# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir index 467bc77c185779..ffbd2d092b5d81 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -37,10 +37,9 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec - ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 %2 = COPY $sgpr30_sgpr31 @@ -146,7 +145,7 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[FLAT_LOAD_DWORD]], implicit $exec ; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535 ; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_]], implicit $exec - ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[V_AND_B32_e64_]](tied-def 0) + ; SDWA-NEXT: 
[[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[FLAT_LOAD_DWORD]](tied-def 0) ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_MOV_B32_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: S_ENDPGM 0 %2 = COPY $sgpr30_sgpr31 @@ -181,17 +180,15 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.1: ; SDWA-NEXT: successors: %bb.2(0x80000000) ; SDWA-NEXT: {{ $}} - ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 6, 0, 1, 3, implicit $mode, implicit $exec - ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[V_MUL_F32_sdwa]], implicit $exec + ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.2: - ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; 
SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 bb.0: From 4d3a5309248e167021913736dfd5276ee536f4ce Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Thu, 23 Jan 2025 15:21:33 +0100 Subject: [PATCH 146/208] [bazel]Fix(2) bazel build past 2e6cc79f816d942ab09d6a310cd925c1da148aa9 (#124118) Fix caused link errors downstream. --- .../llvm-project-overlay/mlir/BUILD.bazel | 39 +++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index d9f222982bc010..72c28faed1d168 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5549,7 +5549,11 @@ cc_library( [ "lib/Dialect/LLVMIR/Transforms/*.cpp", ], - exclude = ["lib/Dialect/LLVMIR/Transforms/LegalizeForExport.cpp"], + exclude = [ + "lib/Dialect/LLVMIR/Transforms/DIExpressionLegalization.cpp", + "lib/Dialect/LLVMIR/Transforms/DIExpressionRewriter.cpp", + "lib/Dialect/LLVMIR/Transforms/LegalizeForExport.cpp", + ], ), hdrs = glob( [ @@ -5583,20 +5587,40 @@ cc_library( cc_library( name = "LLVMIRTransformsLegalizeForExport", srcs = ["lib/Dialect/LLVMIR/Transforms/LegalizeForExport.cpp"], - hdrs = [ - "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionLegalization.h", - "include/mlir/Dialect/LLVMIR/Transforms/DIExpressionRewriter.h", - "include/mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h", - ], + hdrs = ["include/mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h"], includes = ["include"], deps = [ ":IR", - ":LLVMPassIncGen", ":LLVMDialect", + ":LLVMPassIncGen", + ":LLVMIRTransformsDIExpressionLegalization", ":Pass", ], ) +cc_library( + name = "LLVMIRTransformsDIExpressionLegalization", + srcs = ["lib/Dialect/LLVMIR/Transforms/DIExpressionLegalization.cpp"], + hdrs = ["include/mlir/Dialect/LLVMIR/Transforms/DIExpressionLegalization.h"], + includes = ["include"], + deps = [ + 
":LLVMIRTransformsDIExpressionRewriter", + "//llvm:BinaryFormat", + ], +) + +cc_library( + name = "LLVMIRTransformsDIExpressionRewriter", + srcs = ["lib/Dialect/LLVMIR/Transforms/DIExpressionRewriter.cpp"], + hdrs = ["include/mlir/Dialect/LLVMIR/Transforms/DIExpressionRewriter.h"], + includes = ["include"], + deps = [ + ":LLVMDialect", + ":TransformUtils", + "//llvm:Support", + ], +) + td_library( name = "GPUOpsTdFiles", srcs = [ @@ -9244,6 +9268,7 @@ cc_library( ":LLVMConversionIncGen", ":LLVMDialect", ":LLVMIntrinsicConversionIncGen", + ":LLVMIRTransformsDIExpressionLegalization", ":LLVMIRTransformsLegalizeForExport", ":OpenMPDialect", ":Support", From 25653e558c292e9582d8132134af47a1af55499b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Thu, 23 Jan 2025 15:23:56 +0100 Subject: [PATCH 147/208] [AVX10.2] Update convert chapter intrinsic and mnemonics names (#123656) Intel spec for avx10.2 (https://cdrdv2.intel.com/v1/dl/getContent/828965) has been updated. This PR changes relevant names from the "AVX10 CONVERT INSTRUCTIONS" chapter . 
--- clang/include/clang/Basic/BuiltinsX86.td | 48 +- clang/lib/Headers/avx10_2_512convertintrin.h | 157 ++-- clang/lib/Headers/avx10_2convertintrin.h | 310 ++++--- .../CodeGen/X86/avx10_2_512convert-builtins.c | 300 +++---- .../CodeGen/X86/avx10_2convert-builtins.c | 600 ++++++------- llvm/include/llvm/IR/IntrinsicsX86.td | 48 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +- llvm/lib/Target/X86/X86ISelLowering.h | 24 +- llvm/lib/Target/X86/X86InstrAVX10.td | 42 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 24 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 98 +-- .../X86/avx10_2_512convert-intrinsics.ll | 272 +++--- .../CodeGen/X86/avx10_2convert-intrinsics.ll | 544 ++++++------ .../MC/Disassembler/X86/avx10.2convert-32.txt | 832 +++++++++--------- .../MC/Disassembler/X86/avx10.2convert-64.txt | 832 +++++++++--------- llvm/test/MC/X86/avx10.2convert-32-att.s | 832 +++++++++--------- llvm/test/MC/X86/avx10.2convert-32-intel.s | 832 +++++++++--------- llvm/test/MC/X86/avx10.2convert-64-att.s | 832 +++++++++--------- llvm/test/MC/X86/avx10.2convert-64-intel.s | 832 +++++++++--------- llvm/test/TableGen/x86-fold-tables.inc | 288 +++--- 20 files changed, 3879 insertions(+), 3892 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0c9173f9bfccea..b3494a422349ed 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -5191,51 +5191,51 @@ let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] i } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtne2ph2bf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; + def vcvt2ph2bf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtne2ph2bf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, 
_Float16>, _Vector<16, _Float16>)">; + def vcvt2ph2bf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vcvtne2ph2bf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; + def vcvt2ph2bf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtne2ph2bf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; + def vcvt2ph2bf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtne2ph2bf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; + def vcvt2ph2bf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vcvtne2ph2bf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; + def vcvt2ph2bf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtne2ph2hf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; + def vcvt2ph2hf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtne2ph2hf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; + def vcvt2ph2hf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def 
vcvtne2ph2hf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; + def vcvt2ph2hf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtne2ph2hf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; + def vcvt2ph2hf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtne2ph2hf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; + def vcvt2ph2hf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vcvtne2ph2hf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; + def vcvt2ph2hf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { @@ -5251,51 +5251,51 @@ let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] i } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtneph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; + def vcvtph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtneph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; + def vcvtph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; } let Features = "avx10.2-512", Attributes = [NoThrow, 
RequiredVectorWidth<512>] in { - def vcvtneph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; + def vcvtph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtneph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; + def vcvtph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtneph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; + def vcvtph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vcvtneph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; + def vcvtph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtneph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; + def vcvtph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtneph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; + def vcvtph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def 
vcvtneph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; + def vcvtph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vcvtneph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; + def vcvtph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vcvtneph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; + def vcvtph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vcvtneph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; + def vcvtph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { diff --git a/clang/lib/Headers/avx10_2_512convertintrin.h b/clang/lib/Headers/avx10_2_512convertintrin.h index 60a5b1ef4548d8..0b5fca5cda5228 100644 --- a/clang/lib/Headers/avx10_2_512convertintrin.h +++ b/clang/lib/Headers/avx10_2_512convertintrin.h @@ -58,263 +58,258 @@ _mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) { (__mmask32)(U), (const int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtbiasph_pbf8(__m512i __A, __m512h __B) { +_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 
_mm512_mask_cvtbiasph_pbf8( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_bf8( __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtbiasph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { +_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtbiassph_pbf8(__m512i __A, __m512h __B) { +_mm512_cvtbiassph_bf8(__m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_pbf8( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_bf8( __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtbiassph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { +_mm512_maskz_cvtbiassph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtbiasph_phf8(__m512i __A, __m512h __B) { +_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_phf8( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 
_mm512_mask_cvtbiasph_hf8( __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtbiasph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { +_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtbiassph_phf8(__m512i __A, __m512h __B) { +_mm512_cvtbiassph_hf8(__m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_phf8( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_hf8( __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtbiassph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { +_mm512_maskz_cvtbiassph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtne2ph_pbf8(__m512h __A, __m512h __B) { - return (__m512i)__builtin_ia32_vcvtne2ph2bf8_512((__v32hf)(__A), - (__v32hf)(__B)); +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_bf8(__m512h __A, + __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2bf8_512((__v32hf)(__A), + (__v32hf)(__B)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_pbf8( - __m512i __W, __mmask64 __U, __m512h __A, 
__m512h __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_pbf8(__A, __B), (__v64qi)__W); + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtne2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { +_mm512_maskz_cvt2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_pbf8(__A, __B), + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)(__m512i)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtnes2ph_pbf8(__m512h __A, __m512h __B) { - return (__m512i)__builtin_ia32_vcvtne2ph2bf8s_512((__v32hf)(__A), - (__v32hf)(__B)); +_mm512_cvts2ph_bf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2bf8s_512((__v32hf)(__A), + (__v32hf)(__B)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtnes2ph_pbf8( - __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvts2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_pbf8(__A, __B), (__v64qi)__W); + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtnes2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { +_mm512_maskz_cvts2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_pbf8(__A, __B), + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), (__v64qi)(__m512i)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtne2ph_phf8(__m512h __A, __m512h __B) { 
- return (__m512i)__builtin_ia32_vcvtne2ph2hf8_512((__v32hf)(__A), - (__v32hf)(__B)); +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_hf8(__m512h __A, + __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2hf8_512((__v32hf)(__A), + (__v32hf)(__B)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_phf8( - __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_phf8(__A, __B), (__v64qi)__W); + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtne2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { +_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_phf8(__A, __B), + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)(__m512i)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtnes2ph_phf8(__m512h __A, __m512h __B) { - return (__m512i)__builtin_ia32_vcvtne2ph2hf8s_512((__v32hf)(__A), - (__v32hf)(__B)); +_mm512_cvts2ph_hf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2hf8s_512((__v32hf)(__A), + (__v32hf)(__B)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtnes2ph_phf8( - __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvts2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_phf8(__A, __B), (__v64qi)__W); + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtnes2ph_phf8(__mmask64 __U, __m512h __A, 
__m512h __B) { +_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { return (__m512i)__builtin_ia32_selectb_512( - (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_phf8(__A, __B), + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), (__v64qi)(__m512i)_mm512_setzero_si512()); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_cvtnehf8_ph(__m256i __A) { +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtnehf8_ph(__m512h __W, __mmask32 __U, __m256i __A) { +_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtnehf8_ph(__mmask32 __U, __m256i __A) { +_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtneph_pbf8(__m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_bf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtneph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( +_mm512_mask_cvtph_bf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtneph_pbf8(__mmask32 __U, __m512h __A) { - return 
(__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( +_mm512_maskz_cvtph_bf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtnesph_pbf8(__m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_bf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtnesph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( +_mm512_mask_cvtsph_bf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtnesph_pbf8(__mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( +_mm512_maskz_cvtsph_bf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtneph_phf8(__m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_hf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtneph_phf8(__m256i __W, __mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( +_mm512_mask_cvtph_hf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); } static 
__inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtneph_phf8(__mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( +_mm512_maskz_cvtph_hf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtnesph_phf8(__m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_hf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtnesph_phf8(__m256i __W, __mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( +_mm512_mask_cvtsph_hf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtnesph_phf8(__mmask32 __U, __m512h __A) { - return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( +_mm512_maskz_cvtsph_hf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); } -static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtpbf8_ph(__m256i __A) { +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtbf8_ph(__m256i __A) { return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8)); } static __inline __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) { +_mm512_mask_cvtbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) { return _mm512_castsi512_ph( _mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8)); } static __inline __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpbf8_ph(__mmask32 
__U, __m256i __A) { +_mm512_maskz_cvtbf8_ph(__mmask32 __U, __m256i __A) { return _mm512_castsi512_ph( _mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8)); } diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h index efe8477cbbf9be..c67a5b890f1957 100644 --- a/clang/lib/Headers/avx10_2convertintrin.h +++ b/clang/lib/Headers/avx10_2convertintrin.h @@ -77,516 +77,508 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) { (__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \ (__mmask16)(U), (const int)(R))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtbiasph_pbf8(__m128i __A, __m128h __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_bf8(__m128i __A, + __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtbiasph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { +_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { +_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) { +_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_pbf8( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 
_mm256_mask_cvtbiasph_bf8( __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtbiasph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { +_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtbiassph_pbf8(__m128i __A, __m128h __B) { +_mm_cvtbiassph_bf8(__m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtbiassph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { +_mm_mask_cvtbiassph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtbiassph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { +_mm_maskz_cvtbiassph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtbiassph_pbf8(__m256i __A, __m256h __B) { +_mm256_cvtbiassph_bf8(__m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_pbf8( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_bf8( __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { return 
(__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtbiassph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { +_mm256_maskz_cvtbiassph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtbiasph_phf8(__m128i __A, __m128h __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_hf8(__m128i __A, + __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtbiasph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { +_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtbiasph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { +_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtbiasph_phf8(__m256i __A, __m256h __B) { +_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_phf8( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8( __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( 
(__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtbiasph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { +_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtbiassph_phf8(__m128i __A, __m128h __B) { +_mm_cvtbiassph_hf8(__m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtbiassph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { +_mm_mask_cvtbiassph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtbiassph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { +_mm_maskz_cvtbiassph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtbiassph_phf8(__m256i __A, __m256h __B) { +_mm256_cvtbiassph_hf8(__m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_phf8( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_hf8( __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { +_mm256_maskz_cvtbiassph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_pbf8(__m128h __A, - __m128h __B) { - return (__m128i)__builtin_ia32_vcvtne2ph2bf8_128((__v8hf)(__A), - (__v8hf)(__B)); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A), (__v8hf)(__B)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtne2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { +_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B), (__v16qi)__W); + (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtne2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { +_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B), + (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)(__m128i)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtne2ph_pbf8(__m256h __A, __m256h __B) { - return (__m256i)__builtin_ia32_vcvtne2ph2bf8_256((__v16hf)(__A), - (__v16hf)(__B)); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_bf8(__m256h __A, + __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A), + (__v16hf)(__B)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_pbf8( - __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_pbf8(__A, __B), (__v32qi)__W); + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtne2ph_pbf8(__mmask32 __U, __m256h __A, __m256h __B) { +_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_pbf8(__A, __B), + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)(__m256i)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtnes2ph_pbf8(__m128h __A, __m128h __B) { - return (__m128i)__builtin_ia32_vcvtne2ph2bf8s_128((__v8hf)(__A), - (__v8hf)(__B)); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_bf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A), (__v8hf)(__B)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtnes2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { +_mm_mask_cvts2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_pbf8(__A, __B), (__v16qi)__W); + (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { +_mm_maskz_cvts2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_pbf8(__A, __B), + (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), (__v16qi)(__m128i)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtnes2ph_pbf8(__m256h __A, __m256h __B) { - return (__m256i)__builtin_ia32_vcvtne2ph2bf8s_256((__v16hf)(__A), - (__v16hf)(__B)); 
+_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A), + (__v16hf)(__B)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtnes2ph_pbf8( - __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_pbf8(__A, __B), (__v32qi)__W); + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtnes2ph_pbf8(__mmask32 __U, __m256h __A, __m256h __B) { +_mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_pbf8(__A, __B), + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)(__m256i)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_phf8(__m128h __A, - __m128h __B) { - return (__m128i)__builtin_ia32_vcvtne2ph2hf8_128((__v8hf)(__A), - (__v8hf)(__B)); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A), (__v8hf)(__B)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtne2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { +_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtne2ph_phf8(__A, __B), (__v16qi)__W); + (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtne2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { +_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, 
(__v16qi)_mm_cvtne2ph_phf8(__A, __B), + (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)(__m128i)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtne2ph_phf8(__m256h __A, __m256h __B) { - return (__m256i)__builtin_ia32_vcvtne2ph2hf8_256((__v16hf)(__A), - (__v16hf)(__B)); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_hf8(__m256h __A, + __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A), + (__v16hf)(__B)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_phf8( - __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_phf8(__A, __B), (__v32qi)__W); + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtne2ph_phf8(__mmask32 __U, __m256h __A, __m256h __B) { +_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_phf8(__A, __B), + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)(__m256i)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtnes2ph_phf8(__m128h __A, __m128h __B) { - return (__m128i)__builtin_ia32_vcvtne2ph2hf8s_128((__v8hf)(__A), - (__v8hf)(__B)); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_hf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A), (__v8hf)(__B)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtnes2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { +_mm_mask_cvts2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, 
(__v16qi)_mm_cvtnes2ph_phf8(__A, __B), (__v16qi)__W); + (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtnes2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { +_mm_maskz_cvts2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { return (__m128i)__builtin_ia32_selectb_128( - (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_phf8(__A, __B), + (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), (__v16qi)(__m128i)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtnes2ph_phf8(__m256h __A, __m256h __B) { - return (__m256i)__builtin_ia32_vcvtne2ph2hf8s_256((__v16hf)(__A), - (__v16hf)(__B)); +_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A), + (__v16hf)(__B)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtnes2ph_phf8( - __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_phf8(__A, __B), (__v32qi)__W); + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtnes2ph_phf8(__mmask32 __U, __m256h __A, __m256h __B) { +_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) { return (__m256i)__builtin_ia32_selectb_256( - (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_phf8(__A, __B), + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)(__m256i)_mm256_setzero_si256()); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtnehf8_ph(__m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1); } -static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 -_mm_mask_cvtnehf8_ph(__m128h __W, __mmask8 __U, __m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W, + __mmask8 __U, + __m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtnehf8_ph(__mmask8 __U, __m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U, + __m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_cvtnehf8_ph(__m128i __A) { +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtnehf8_ph(__m256h __W, __mmask16 __U, __m128i __A) { +_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtnehf8_ph(__mmask16 __U, __m128i __A) { +_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtneph_pbf8(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtneph_pbf8(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( 
+_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtneph_pbf8(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( +_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtneph_pbf8(__m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_bf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtneph_pbf8(__m128i __W, __mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( +_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtneph_pbf8(__mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( +_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtnesph_pbf8(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_bf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtnesph_pbf8(__m128i __W, __mmask8 __U, __m128h __A) { - 
return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( +_mm_mask_cvtsph_bf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtnesph_pbf8(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( +_mm_maskz_cvtsph_bf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtnesph_pbf8(__m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_bf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtnesph_pbf8(__m128i __W, __mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( +_mm256_mask_cvtsph_bf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtnesph_pbf8(__mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( +_mm256_maskz_cvtsph_bf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtneph_phf8(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 -_mm_mask_cvtneph_phf8(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( +_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtneph_phf8(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( +_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtneph_phf8(__m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_hf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtneph_phf8(__m128i __W, __mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( +_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtneph_phf8(__mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( +_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtnesph_phf8(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_hf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( (__v8hf)__A, 
(__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtnesph_phf8(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( +_mm_mask_cvtsph_hf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtnesph_phf8(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( +_mm_maskz_cvtsph_hf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtnesph_phf8(__m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_hf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtnesph_phf8(__m128i __W, __mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( +_mm256_mask_cvtsph_hf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtnesph_phf8(__mmask16 __U, __m256h __A) { - return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( +_mm256_maskz_cvtsph_hf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpbf8_ph(__m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) { return 
_mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) { +_mm_mask_cvtbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) { return _mm_castsi128_ph( _mm_mask_slli_epi16((__m128i)__S, __U, _mm_cvtepi8_epi16(__A), 8)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpbf8_ph(__mmask8 __U, __m128i __A) { +_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) { return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8)); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtpbf8_ph(__m128i __A) { +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) { return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpbf8_ph(__m256h __S, __mmask16 __U, __m128i __A) { +_mm256_mask_cvtbf8_ph(__m256h __S, __mmask16 __U, __m128i __A) { return _mm256_castsi256_ph( _mm256_mask_slli_epi16((__m256i)__S, __U, _mm256_cvtepi8_epi16(__A), 8)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpbf8_ph(__mmask16 __U, __m128i __A) { +_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) { return _mm256_castsi256_ph( _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8)); } diff --git a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c index 6662e0cbf8a913..22503c640a727f 100644 --- a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c @@ -41,278 +41,278 @@ __m512h test_mm512_maskz_cvtx_round2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) return _mm512_maskz_cvtx_round2ps_ph(__U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -__m256i test_mm512_cvtbiasph_pbf8(__m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtbiasph_pbf8( +__m256i test_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) { + // 
CHECK-LABEL: @test_mm512_cvtbiasph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( - return _mm512_cvtbiasph_pbf8(__A, __B); + return _mm512_cvtbiasph_bf8(__A, __B); } -__m256i test_mm512_mask_cvtbiasph_pbf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtbiasph_pbf8( +__m256i test_mm512_mask_cvtbiasph_bf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiasph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( - return _mm512_mask_cvtbiasph_pbf8(__W, __U, __A, __B); + return _mm512_mask_cvtbiasph_bf8(__W, __U, __A, __B); } -__m256i test_mm512_maskz_cvtbiasph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_pbf8( +__m256i test_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( - return _mm512_maskz_cvtbiasph_pbf8(__U, __A, __B); + return _mm512_maskz_cvtbiasph_bf8(__U, __A, __B); } -__m256i test_mm512_cvtbiassph_pbf8(__m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtbiassph_pbf8( +__m256i test_mm512_cvtbiassph_bf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiassph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( - return _mm512_cvtbiassph_pbf8(__A, __B); + return _mm512_cvtbiassph_bf8(__A, __B); } -__m256i test_mm512_mask_cvtbiassph_pbf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtbiassph_pbf8( +__m256i test_mm512_mask_cvtbiassph_bf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiassph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( - return _mm512_mask_cvtbiassph_pbf8(__W, __U, __A, __B); + return _mm512_mask_cvtbiassph_bf8(__W, __U, __A, __B); } -__m256i 
test_mm512_maskz_cvtbiassph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_pbf8( +__m256i test_mm512_maskz_cvtbiassph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_bf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( - return _mm512_maskz_cvtbiassph_pbf8(__U, __A, __B); + return _mm512_maskz_cvtbiassph_bf8(__U, __A, __B); } -__m256i test_mm512_cvtbiasph_phf8(__m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtbiasph_phf8( +__m256i test_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiasph_hf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( - return _mm512_cvtbiasph_phf8(__A, __B); + return _mm512_cvtbiasph_hf8(__A, __B); } -__m256i test_mm512_mask_cvtbiasph_phf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtbiasph_phf8( +__m256i test_mm512_mask_cvtbiasph_hf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiasph_hf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( - return _mm512_mask_cvtbiasph_phf8(__W, __U, __A, __B); + return _mm512_mask_cvtbiasph_hf8(__W, __U, __A, __B); } -__m256i test_mm512_maskz_cvtbiasph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_phf8( +__m256i test_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_hf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( - return _mm512_maskz_cvtbiasph_phf8(__U, __A, __B); + return _mm512_maskz_cvtbiasph_hf8(__U, __A, __B); } -__m256i test_mm512_cvtbiassph_phf8(__m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtbiassph_phf8( +__m256i test_mm512_cvtbiassph_hf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiassph_hf8( // CHECK: call <32 x i8> 
@llvm.x86.avx10.mask.vcvtbiasph2hf8s512( - return _mm512_cvtbiassph_phf8(__A, __B); + return _mm512_cvtbiassph_hf8(__A, __B); } -__m256i test_mm512_mask_cvtbiassph_phf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtbiassph_phf8( +__m256i test_mm512_mask_cvtbiassph_hf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiassph_hf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512( - return _mm512_mask_cvtbiassph_phf8(__W, __U, __A, __B); + return _mm512_mask_cvtbiassph_hf8(__W, __U, __A, __B); } -__m256i test_mm512_maskz_cvtbiassph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_phf8( +__m256i test_mm512_maskz_cvtbiassph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_hf8( // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512( - return _mm512_maskz_cvtbiassph_phf8(__U, __A, __B); + return _mm512_maskz_cvtbiassph_hf8(__U, __A, __B); } -__m512i test_mm512_cvtne2ph_pbf8(__m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtne2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( - return _mm512_cvtne2ph_pbf8(__A, __B); +__m512i test_mm512_cvt2ph_bf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvt2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512( + return _mm512_cvt2ph_bf8(__A, __B); } -__m512i test_mm512_mask_cvtne2ph_pbf8(__m512i __W, __mmask32 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtne2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( +__m512i test_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvt2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512( // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} // CHECK: ret <8 x i64> %{{.*}} - return 
_mm512_mask_cvtne2ph_pbf8(__W, __U, __A, __B); + return _mm512_mask_cvt2ph_bf8(__W, __U, __A, __B); } -__m512i test_mm512_maskz_cvtne2ph_pbf8(__mmask32 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtne2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( +__m512i test_mm512_maskz_cvt2ph_bf8(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvt2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512( // CHECK: zeroinitializer // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_maskz_cvtne2ph_pbf8(__U, __A, __B); + return _mm512_maskz_cvt2ph_bf8(__U, __A, __B); } -__m512i test_mm512_cvtnes2ph_pbf8(__m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtnes2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( - return _mm512_cvtnes2ph_pbf8(__A, __B); +__m512i test_mm512_cvts2ph_bf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvts2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512( + return _mm512_cvts2ph_bf8(__A, __B); } -__m512i test_mm512_mask_cvtnes2ph_pbf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtnes2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( +__m512i test_mm512_mask_cvts2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvts2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512( // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} // CHECK: ret <8 x i64> %{{.*}} - return _mm512_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); + return _mm512_mask_cvts2ph_bf8(__W, __U, __A, __B); } -__m512i test_mm512_maskz_cvtnes2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtnes2ph_pbf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( +__m512i test_mm512_maskz_cvts2ph_bf8(__mmask64 __U, __m512h __A, 
__m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvts2ph_bf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512( // CHECK: zeroinitializer // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_maskz_cvtnes2ph_pbf8(__U, __A, __B); + return _mm512_maskz_cvts2ph_bf8(__U, __A, __B); } -__m512i test_mm512_cvtne2ph_phf8(__m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_cvtne2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( - return _mm512_cvtne2ph_phf8(__A, __B); +__m512i test_mm512_cvt2ph_hf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvt2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512( + return _mm512_cvt2ph_hf8(__A, __B); } -__m512i test_mm512_mask_cvtne2ph_phf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtne2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( +__m512i test_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvt2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512( // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} // CHECK: ret <8 x i64> %{{.*}} - return _mm512_mask_cvtne2ph_phf8(__W, __U, __A, __B); + return _mm512_mask_cvt2ph_hf8(__W, __U, __A, __B); } -__m512i test_mm512_maskz_cvtne2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtne2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( +__m512i test_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvt2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512( // CHECK: zeroinitializer // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_maskz_cvtne2ph_phf8(__U, __A, __B); + return _mm512_maskz_cvt2ph_hf8(__U, __A, __B); } -__m512i test_mm512_cvtnes2ph_phf8(__m512h __A, __m512h 
__B) { - // CHECK-LABEL: @test_mm512_cvtnes2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( - return _mm512_cvtnes2ph_phf8(__A, __B); +__m512i test_mm512_cvts2ph_hf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvts2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512( + return _mm512_cvts2ph_hf8(__A, __B); } -__m512i test_mm512_mask_cvtnes2ph_phf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_mask_cvtnes2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( +__m512i test_mm512_mask_cvts2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvts2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512( // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} // CHECK: ret <8 x i64> %{{.*}} - return _mm512_mask_cvtnes2ph_phf8(__W, __U, __A, __B); + return _mm512_mask_cvts2ph_hf8(__W, __U, __A, __B); } -__m512i test_mm512_maskz_cvtnes2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtnes2ph_phf8( - // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( +__m512i test_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvts2ph_hf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512( // CHECK: zeroinitializer // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_maskz_cvtnes2ph_phf8(__U, __A, __B); + return _mm512_maskz_cvts2ph_hf8(__U, __A, __B); } -__m512h test_mm512_cvtnehf8_ph(__m256i __A) { - // CHECK-LABEL: @test_mm512_cvtnehf8_ph( +__m512h test_mm512_cvthf8(__m256i __A) { + // CHECK-LABEL: @test_mm512_cvthf8( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_cvtnehf8_ph(__A); + return _mm512_cvthf8(__A); } -__m512h test_mm512_mask_cvtnehf8_ph(__m512h __A, __mmask32 __B, __m256i __C) { - // CHECK-LABEL: 
@test_mm512_mask_cvtnehf8_ph( +__m512h test_mm512_mask_cvthf8(__m512h __A, __mmask32 __B, __m256i __C) { + // CHECK-LABEL: @test_mm512_mask_cvthf8( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_mask_cvtnehf8_ph(__A, __B, __C); + return _mm512_mask_cvthf8(__A, __B, __C); } -__m512h test_mm512_maskz_cvtnehf8_ph(__mmask32 __A, __m256i __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtnehf8_ph( +__m512h test_mm512_maskz_cvthf8(__mmask32 __A, __m256i __B) { + // CHECK-LABEL: @test_mm512_maskz_cvthf8( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_maskz_cvtnehf8_ph(__A, __B); + return _mm512_maskz_cvthf8(__A, __B); } -__m256i test_mm512_cvtneph_pbf8(__m512h __A) { - // CHECK-LABEL: @test_mm512_cvtneph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( - return _mm512_cvtneph_pbf8(__A); +__m256i test_mm512_cvtph_bf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512( + return _mm512_cvtph_bf8(__A); } -__m256i test_mm512_mask_cvtneph_pbf8(__m256i __A, __mmask32 __B, __m512h __C) { - // CHECK-LABEL: @test_mm512_mask_cvtneph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( - return _mm512_mask_cvtneph_pbf8(__A, __B, __C); +__m256i test_mm512_mask_cvtph_bf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512( + return _mm512_mask_cvtph_bf8(__A, __B, __C); } -__m256i test_mm512_maskz_cvtneph_pbf8(__mmask32 __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtneph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( - return _mm512_maskz_cvtneph_pbf8(__A, __B); +__m256i test_mm512_maskz_cvtph_bf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512( + return _mm512_maskz_cvtph_bf8(__A, __B); } -__m256i 
test_mm512_cvtnesph_pbf8(__m512h __A) { - // CHECK-LABEL: @test_mm512_cvtnesph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( - return _mm512_cvtnesph_pbf8(__A); +__m256i test_mm512_cvtsph_bf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtsph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512( + return _mm512_cvtsph_bf8(__A); } -__m256i test_mm512_mask_cvtnesph_pbf8(__m256i __A, __mmask32 __B, __m512h __C) { - // CHECK-LABEL: @test_mm512_mask_cvtnesph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( - return _mm512_mask_cvtnesph_pbf8(__A, __B, __C); +__m256i test_mm512_mask_cvtsph_bf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtsph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512( + return _mm512_mask_cvtsph_bf8(__A, __B, __C); } -__m256i test_mm512_maskz_cvtnesph_pbf8(__mmask32 __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtnesph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( - return _mm512_maskz_cvtnesph_pbf8(__A, __B); +__m256i test_mm512_maskz_cvtsph_bf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtsph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512( + return _mm512_maskz_cvtsph_bf8(__A, __B); } -__m256i test_mm512_cvtneph_phf8(__m512h __A) { - // CHECK-LABEL: @test_mm512_cvtneph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( - return _mm512_cvtneph_phf8(__A); +__m256i test_mm512_cvtph_hf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512( + return _mm512_cvtph_hf8(__A); } -__m256i test_mm512_mask_cvtneph_phf8(__m256i __A, __mmask32 __B, __m512h __C) { - // CHECK-LABEL: @test_mm512_mask_cvtneph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( - return _mm512_mask_cvtneph_phf8(__A, __B, __C); +__m256i test_mm512_mask_cvtph_hf8(__m256i __A, __mmask32 
__B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512( + return _mm512_mask_cvtph_hf8(__A, __B, __C); } -__m256i test_mm512_maskz_cvtneph_phf8(__mmask32 __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtneph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( - return _mm512_maskz_cvtneph_phf8(__A, __B); +__m256i test_mm512_maskz_cvtph_hf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512( + return _mm512_maskz_cvtph_hf8(__A, __B); } -__m256i test_mm512_cvtnesph_phf8(__m512h __A) { - // CHECK-LABEL: @test_mm512_cvtnesph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( - return _mm512_cvtnesph_phf8(__A); +__m256i test_mm512_cvtsph_hf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtsph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512( + return _mm512_cvtsph_hf8(__A); } -__m256i test_mm512_mask_cvtnesph_phf8(__m256i __A, __mmask32 __B, __m512h __C) { - // CHECK-LABEL: @test_mm512_mask_cvtnesph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( - return _mm512_mask_cvtnesph_phf8(__A, __B, __C); +__m256i test_mm512_mask_cvtsph_hf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtsph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512( + return _mm512_mask_cvtsph_hf8(__A, __B, __C); } -__m256i test_mm512_maskz_cvtnesph_phf8(__mmask32 __A, __m512h __B) { - // CHECK-LABEL: @test_mm512_maskz_cvtnesph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( - return _mm512_maskz_cvtnesph_phf8(__A, __B); +__m256i test_mm512_maskz_cvtsph_hf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtsph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512( + return _mm512_maskz_cvtsph_hf8(__A, __B); } -__m512h test_mm512_cvtpbf8_ph(__m256i 
A) { - // CHECK-LABEL: @test_mm512_cvtpbf8_ph +__m512h test_mm512_cvtbf8_ph(__m256i A) { + // CHECK-LABEL: @test_mm512_cvtbf8_ph // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> // CHECK: @llvm.x86.avx512.pslli.w.512 // CHECK: ret <32 x half> %{{.*}} - return _mm512_cvtpbf8_ph(A); + return _mm512_cvtbf8_ph(A); } -__m512h test_mm512_mask_cvtpbf8_ph(__m512h S, __mmask32 M, __m256i A) { - // CHECK-LABEL: @test_mm512_mask_cvtpbf8_ph +__m512h test_mm512_mask_cvtbf8_ph(__m512h S, __mmask32 M, __m256i A) { + // CHECK-LABEL: @test_mm512_mask_cvtbf8_ph // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> // CHECK: @llvm.x86.avx512.pslli.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} // CHECK: ret <32 x half> %{{.*}} - return _mm512_mask_cvtpbf8_ph(S, M, A); + return _mm512_mask_cvtbf8_ph(S, M, A); } -__m512h test_mm512_maskz_cvtpbf8_ph(__mmask32 M, __m256i A) { - // CHECK-LABEL: @test_mm512_maskz_cvtpbf8_ph +__m512h test_mm512_maskz_cvtbf8_ph(__mmask32 M, __m256i A) { + // CHECK-LABEL: @test_mm512_maskz_cvtbf8_ph // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} // CHECK: @llvm.x86.avx512.pslli.w.512 // CHECK: ret <32 x half> %{{.*}} - return _mm512_maskz_cvtpbf8_ph(M, A); + return _mm512_maskz_cvtbf8_ph(M, A); } diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c index 8086c1b5d33993..efd9a31c40875a 100644 --- a/clang/test/CodeGen/X86/avx10_2convert-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c @@ -59,554 +59,554 @@ __m256h test_mm256_maskz_cvtx_round2ps_ph(__mmask8 __U, __m256 __A, __m256 __B) return _mm256_maskz_cvtx_round2ps_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } -__m128i test_mm_cvtbiasph_pbf8(__m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtbiasph_pbf8( +__m128i test_mm_cvtbiasph_bf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiasph_bf8( // CHECK: 
call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( - return _mm_cvtbiasph_pbf8(__A, __B); + return _mm_cvtbiasph_bf8(__A, __B); } -__m128i test_mm_mask_cvtbiasph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtbiasph_pbf8( +__m128i test_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiasph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( - return _mm_mask_cvtbiasph_pbf8(__W, __U, __A, __B); + return _mm_mask_cvtbiasph_bf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtbiasph_pbf8( +__m128i test_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiasph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( - return _mm_maskz_cvtbiasph_pbf8(__U, __A, __B); + return _mm_maskz_cvtbiasph_bf8(__U, __A, __B); } -__m128i test_mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtbiasph_pbf8( +__m128i test_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiasph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( - return _mm256_cvtbiasph_pbf8(__A, __B); + return _mm256_cvtbiasph_bf8(__A, __B); } -__m128i test_mm256_mask_cvtbiasph_pbf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtbiasph_pbf8( +__m128i test_mm256_mask_cvtbiasph_bf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiasph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( - return _mm256_mask_cvtbiasph_pbf8(__W, __U, __A, __B); + return _mm256_mask_cvtbiasph_bf8(__W, __U, __A, __B); } -__m128i test_mm256_maskz_cvtbiasph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_pbf8( +__m128i 
test_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( - return _mm256_maskz_cvtbiasph_pbf8(__U, __A, __B); + return _mm256_maskz_cvtbiasph_bf8(__U, __A, __B); } -__m128i test_mm_cvtbiassph_pbf8(__m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtbiassph_pbf8( +__m128i test_mm_cvtbiassph_bf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( - return _mm_cvtbiassph_pbf8(__A, __B); + return _mm_cvtbiassph_bf8(__A, __B); } -__m128i test_mm_mask_cvtbiassph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtbiassph_pbf8( +__m128i test_mm_mask_cvtbiassph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( - return _mm_mask_cvtbiassph_pbf8(__W, __U, __A, __B); + return _mm_mask_cvtbiassph_bf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtbiassph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtbiassph_pbf8( +__m128i test_mm_maskz_cvtbiassph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( - return _mm_maskz_cvtbiassph_pbf8(__U, __A, __B); + return _mm_maskz_cvtbiassph_bf8(__U, __A, __B); } -__m128i test_mm256_cvtbiassph_pbf8(__m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtbiassph_pbf8( +__m128i test_mm256_cvtbiassph_bf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( - return _mm256_cvtbiassph_pbf8(__A, __B); + return _mm256_cvtbiassph_bf8(__A, __B); } -__m128i test_mm256_mask_cvtbiassph_pbf8(__m128i __W, __mmask16 __U, __m256i __A, 
__m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtbiassph_pbf8( +__m128i test_mm256_mask_cvtbiassph_bf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( - return _mm256_mask_cvtbiassph_pbf8(__W, __U, __A, __B); + return _mm256_mask_cvtbiassph_bf8(__W, __U, __A, __B); } -__m128i test_mm256_maskz_cvtbiassph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_pbf8( +__m128i test_mm256_maskz_cvtbiassph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_bf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( - return _mm256_maskz_cvtbiassph_pbf8(__U, __A, __B); + return _mm256_maskz_cvtbiassph_bf8(__U, __A, __B); } -__m128i test_mm_cvtbiasph_phf8(__m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtbiasph_phf8( +__m128i test_mm_cvtbiasph_hf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( - return _mm_cvtbiasph_phf8(__A, __B); + return _mm_cvtbiasph_hf8(__A, __B); } -__m128i test_mm_mask_cvtbiasph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtbiasph_phf8( +__m128i test_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( - return _mm_mask_cvtbiasph_phf8(__W, __U, __A, __B); + return _mm_mask_cvtbiasph_hf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtbiasph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtbiasph_phf8( +__m128i test_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( - return 
_mm_maskz_cvtbiasph_phf8(__U, __A, __B); + return _mm_maskz_cvtbiasph_hf8(__U, __A, __B); } -__m128i test_mm256_cvtbiasph_phf8(__m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtbiasph_phf8( +__m128i test_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( - return _mm256_cvtbiasph_phf8(__A, __B); + return _mm256_cvtbiasph_hf8(__A, __B); } -__m128i test_mm256_mask_cvtbiasph_phf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtbiasph_phf8( +__m128i test_mm256_mask_cvtbiasph_hf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( - return _mm256_mask_cvtbiasph_phf8(__W, __U, __A, __B); + return _mm256_mask_cvtbiasph_hf8(__W, __U, __A, __B); } -__m128i test_mm256_maskz_cvtbiasph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_phf8( +__m128i test_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( - return _mm256_maskz_cvtbiasph_phf8(__U, __A, __B); + return _mm256_maskz_cvtbiasph_hf8(__U, __A, __B); } -__m128i test_mm_cvtbiassph_phf8(__m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtbiassph_phf8( +__m128i test_mm_cvtbiassph_hf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( - return _mm_cvtbiassph_phf8(__A, __B); + return _mm_cvtbiassph_hf8(__A, __B); } -__m128i test_mm_mask_cvtbiassph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtbiassph_phf8( +__m128i test_mm_mask_cvtbiassph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: 
@test_mm_mask_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( - return _mm_mask_cvtbiassph_phf8(__W, __U, __A, __B); + return _mm_mask_cvtbiassph_hf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtbiassph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtbiassph_phf8( +__m128i test_mm_maskz_cvtbiassph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( - return _mm_maskz_cvtbiassph_phf8(__U, __A, __B); + return _mm_maskz_cvtbiassph_hf8(__U, __A, __B); } -__m128i test_mm256_cvtbiassph_phf8(__m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtbiassph_phf8( +__m128i test_mm256_cvtbiassph_hf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( - return _mm256_cvtbiassph_phf8(__A, __B); + return _mm256_cvtbiassph_hf8(__A, __B); } -__m128i test_mm256_mask_cvtbiassph_phf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtbiassph_phf8( +__m128i test_mm256_mask_cvtbiassph_hf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( - return _mm256_mask_cvtbiassph_phf8(__W, __U, __A, __B); + return _mm256_mask_cvtbiassph_hf8(__W, __U, __A, __B); } -__m128i test_mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_phf8( +__m128i test_mm256_maskz_cvtbiassph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_hf8( // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( - return _mm256_maskz_cvtbiassph_phf8(__U, __A, __B); + return _mm256_maskz_cvtbiassph_hf8(__U, __A, __B); } -__m128i test_mm_cvtne2ph_pbf8(__m128h __A, __m128h __B) { - 
// CHECK-LABEL: @test_mm_cvtne2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( - return _mm_cvtne2ph_pbf8(__A, __B); +__m128i test_mm_cvt2ph_bf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvt2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128( + return _mm_cvt2ph_bf8(__A, __B); } -__m128i test_mm_mask_cvtne2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtne2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( +__m128i test_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvt2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128( // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} // CHECK: ret <2 x i64> %{{.*}} - return _mm_mask_cvtne2ph_pbf8(__W, __U, __A, __B); + return _mm_mask_cvt2ph_bf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtne2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtne2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( +__m128i test_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvt2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128( // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_cvtne2ph_pbf8(__U, __A, __B); + return _mm_maskz_cvt2ph_bf8(__U, __A, __B); } -__m256i test_mm256_cvtne2ph_pbf8(__m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtne2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256( - return _mm256_cvtne2ph_pbf8(__A, __B); +__m256i test_mm256_cvt2ph_bf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvt2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256( + return _mm256_cvt2ph_bf8(__A, __B); } -__m256i test_mm256_mask_cvtne2ph_pbf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) 
{ - // CHECK-LABEL: @test_mm256_mask_cvtne2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256( +__m256i test_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvt2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256( // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} // CHECK: ret <4 x i64> %{{.*}} - return _mm256_mask_cvtne2ph_pbf8(__W, __U, __A, __B); + return _mm256_mask_cvt2ph_bf8(__W, __U, __A, __B); } -__m256i test_mm256_maskz_cvtne2ph_pbf8(__mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtne2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256( +__m256i test_mm256_maskz_cvt2ph_bf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvt2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256( // CHECK: zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_cvtne2ph_pbf8(__U, __A, __B); + return _mm256_maskz_cvt2ph_bf8(__U, __A, __B); } -__m128i test_mm_cvtnes2ph_pbf8(__m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtnes2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( - return _mm_cvtnes2ph_pbf8(__A, __B); +__m128i test_mm_cvts2ph_bf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvts2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128( + return _mm_cvts2ph_bf8(__A, __B); } -__m128i test_mm_mask_cvtnes2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtnes2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( +__m128i test_mm_mask_cvts2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvts2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128( // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} // CHECK: ret <2 x 
i64> %{{.*}} - return _mm_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); + return _mm_mask_cvts2ph_bf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtnes2ph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( +__m128i test_mm_maskz_cvts2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvts2ph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128( // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_cvtnes2ph_pbf8(__U, __A, __B); + return _mm_maskz_cvts2ph_bf8(__U, __A, __B); } -__m256i test_mm256_cvtnes2ph_pbf8(__m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtnes2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( - return _mm256_cvtnes2ph_pbf8(__A, __B); +__m256i test_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvts2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256( + return _mm256_cvts2ph_bf8(__A, __B); } -__m256i test_mm256_mask_cvtnes2ph_pbf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtnes2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( +__m256i test_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvts2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256( // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} // CHECK: ret <4 x i64> %{{.*}} - return _mm256_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); + return _mm256_mask_cvts2ph_bf8(__W, __U, __A, __B); } -__m256i test_mm256_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtnes2ph_pbf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( +__m256i test_mm256_maskz_cvts2ph_bf8(__mmask16 __U, __m256h 
__A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvts2ph_bf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256( // CHECK: zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_cvtnes2ph_pbf8(__U, __A, __B); + return _mm256_maskz_cvts2ph_bf8(__U, __A, __B); } -__m128i test_mm_cvtne2ph_phf8(__m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtne2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( - return _mm_cvtne2ph_phf8(__A, __B); +__m128i test_mm_cvt2ph_hf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvt2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128( + return _mm_cvt2ph_hf8(__A, __B); } -__m128i test_mm_mask_cvtne2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_mask_cvtne2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( +__m128i test_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvt2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128( // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} // CHECK: ret <2 x i64> %{{.*}} - return _mm_mask_cvtne2ph_phf8(__W, __U, __A, __B); + return _mm_mask_cvt2ph_hf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtne2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtne2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( +__m128i test_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvt2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128( // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_cvtne2ph_phf8(__U, __A, __B); + return _mm_maskz_cvt2ph_hf8(__U, __A, __B); } -__m256i test_mm256_cvtne2ph_phf8(__m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtne2ph_phf8( 
- // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( - return _mm256_cvtne2ph_phf8(__A, __B); +__m256i test_mm256_cvt2ph_hf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvt2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256( + return _mm256_cvt2ph_hf8(__A, __B); } -__m256i test_mm256_mask_cvtne2ph_phf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtne2ph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( +__m256i test_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvt2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256( // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} // CHECK: ret <4 x i64> %{{.*}} - return _mm256_mask_cvtne2ph_phf8(__W, __U, __A, __B); + return _mm256_mask_cvt2ph_hf8(__W, __U, __A, __B); } -__m256i test_mm256_maskz_cvtne2ph_phf8(__mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtne2ph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( +__m256i test_mm256_maskz_cvt2ph_hf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvt2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256( // CHECK: zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_cvtne2ph_phf8(__U, __A, __B); + return _mm256_maskz_cvt2ph_hf8(__U, __A, __B); } -__m128i test_mm_cvtnes2ph_phf8(__m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_cvtnes2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( - return _mm_cvtnes2ph_phf8(__A, __B); +__m128i test_mm_cvts2ph_hf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvts2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128( + return _mm_cvts2ph_hf8(__A, __B); } -__m128i test_mm_mask_cvtnes2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { - 
// CHECK-LABEL: @test_mm_mask_cvtnes2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( +__m128i test_mm_mask_cvts2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvts2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128( // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} // CHECK: ret <2 x i64> %{{.*}} - return _mm_mask_cvtnes2ph_phf8(__W, __U, __A, __B); + return _mm_mask_cvts2ph_hf8(__W, __U, __A, __B); } -__m128i test_mm_maskz_cvtnes2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtnes2ph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( +__m128i test_mm_maskz_cvts2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvts2ph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128( // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_cvtnes2ph_phf8(__U, __A, __B); + return _mm_maskz_cvts2ph_hf8(__U, __A, __B); } -__m256i test_mm256_cvtnes2ph_phf8(__m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_cvtnes2ph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( - return _mm256_cvtnes2ph_phf8(__A, __B); +__m256i test_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvts2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256( + return _mm256_cvts2ph_hf8(__A, __B); } -__m256i test_mm256_mask_cvtnes2ph_phf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_mask_cvtnes2ph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( +__m256i test_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvts2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256( // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} // CHECK: 
ret <4 x i64> %{{.*}} - return _mm256_mask_cvtnes2ph_phf8(__W, __U, __A, __B); + return _mm256_mask_cvts2ph_hf8(__W, __U, __A, __B); } -__m256i test_mm256_maskz_cvtnes2ph_phf8(__mmask16 __U, __m256h __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtnes2ph_phf8( - // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( +__m256i test_mm256_maskz_cvts2ph_hf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvts2ph_hf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256( // CHECK: zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_cvtnes2ph_phf8(__U, __A, __B); + return _mm256_maskz_cvts2ph_hf8(__U, __A, __B); } -__m128h test_mm_cvtnehf8_ph(__m128i __A) { - // CHECK-LABEL: @test_mm_cvtnehf8_ph( +__m128h test_mm_cvthf8(__m128i __A) { + // CHECK-LABEL: @test_mm_cvthf8( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_cvtnehf8_ph(__A); + return _mm_cvthf8(__A); } -__m128h test_mm_mask_cvtnehf8_ph(__m128h __A, __mmask8 __B, __m128i __C) { - // CHECK-LABEL: @test_mm_mask_cvtnehf8_ph( +__m128h test_mm_mask_cvthf8(__m128h __A, __mmask8 __B, __m128i __C) { + // CHECK-LABEL: @test_mm_mask_cvthf8( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_mask_cvtnehf8_ph(__A, __B, __C); + return _mm_mask_cvthf8(__A, __B, __C); } -__m128h test_mm_maskz_cvtnehf8_ph(__mmask8 __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_cvtnehf8_ph( +__m128h test_mm_maskz_cvthf8(__mmask8 __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_cvthf8( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_maskz_cvtnehf8_ph(__A, __B); + return _mm_maskz_cvthf8(__A, __B); } -__m256h test_mm256_cvtnehf8_ph(__m128i __A) { - // CHECK-LABEL: @test_mm256_cvtnehf8_ph( +__m256h test_mm256_cvthf8(__m128i __A) { + // CHECK-LABEL: @test_mm256_cvthf8( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return 
_mm256_cvtnehf8_ph(__A); + return _mm256_cvthf8(__A); } -__m256h test_mm256_mask_cvtnehf8_ph(__m256h __A, __mmask16 __B, __m128i __C) { - // CHECK-LABEL: @test_mm256_mask_cvtnehf8_ph( +__m256h test_mm256_mask_cvthf8(__m256h __A, __mmask16 __B, __m128i __C) { + // CHECK-LABEL: @test_mm256_mask_cvthf8( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return _mm256_mask_cvtnehf8_ph(__A, __B, __C); + return _mm256_mask_cvthf8(__A, __B, __C); } -__m256h test_mm256_maskz_cvtnehf8_ph(__mmask16 __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtnehf8_ph( +__m256h test_mm256_maskz_cvthf8(__mmask16 __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_maskz_cvthf8( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return _mm256_maskz_cvtnehf8_ph(__A, __B); + return _mm256_maskz_cvthf8(__A, __B); } -__m128i test_mm_cvtneph_pbf8(__m128h __A) { - // CHECK-LABEL: @test_mm_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( - return _mm_cvtneph_pbf8(__A); +__m128i test_mm_cvtph_bf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128( + return _mm_cvtph_bf8(__A); } -__m128i test_mm_mask_cvtneph_pbf8(__m128i __A, __mmask8 __B, __m128h __C) { - // CHECK-LABEL: @test_mm_mask_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( - return _mm_mask_cvtneph_pbf8(__A, __B, __C); +__m128i test_mm_mask_cvtph_bf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128( + return _mm_mask_cvtph_bf8(__A, __B, __C); } -__m128i test_mm_maskz_cvtneph_pbf8(__mmask8 __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( - return _mm_maskz_cvtneph_pbf8(__A, __B); +__m128i test_mm_maskz_cvtph_bf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtph_bf8( + // CHECK: call <16 
x i8> @llvm.x86.avx10.mask.vcvtph2bf8128( + return _mm_maskz_cvtph_bf8(__A, __B); } -__m128i test_mm256_cvtneph_pbf8(__m256h __A) { - // CHECK-LABEL: @test_mm256_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( - return _mm256_cvtneph_pbf8(__A); +__m128i test_mm256_cvtph_bf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256( + return _mm256_cvtph_bf8(__A); } -__m128i test_mm256_mask_cvtneph_pbf8(__m128i __A, __mmask16 __B, __m256h __C) { - // CHECK-LABEL: @test_mm256_mask_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( - return _mm256_mask_cvtneph_pbf8(__A, __B, __C); +__m128i test_mm256_mask_cvtph_bf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256( + return _mm256_mask_cvtph_bf8(__A, __B, __C); } -__m128i test_mm256_maskz_cvtneph_pbf8(__mmask16 __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtneph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( - return _mm256_maskz_cvtneph_pbf8(__A, __B); +__m128i test_mm256_maskz_cvtph_bf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256( + return _mm256_maskz_cvtph_bf8(__A, __B); } -__m128i test_mm_cvtnesph_pbf8(__m128h __A) { - // CHECK-LABEL: @test_mm_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( - return _mm_cvtnesph_pbf8(__A); +__m128i test_mm_cvtsph_bf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128( + return _mm_cvtsph_bf8(__A); } -__m128i test_mm_mask_cvtnesph_pbf8(__m128i __A, __mmask8 __B, __m128h __C) { - // CHECK-LABEL: @test_mm_mask_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( - return _mm_mask_cvtnesph_pbf8(__A, __B, __C); 
+__m128i test_mm_mask_cvtsph_bf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128( + return _mm_mask_cvtsph_bf8(__A, __B, __C); } -__m128i test_mm_maskz_cvtnesph_pbf8(__mmask8 __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( - return _mm_maskz_cvtnesph_pbf8(__A, __B); +__m128i test_mm_maskz_cvtsph_bf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128( + return _mm_maskz_cvtsph_bf8(__A, __B); } -__m128i test_mm256_cvtnesph_pbf8(__m256h __A) { - // CHECK-LABEL: @test_mm256_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( - return _mm256_cvtnesph_pbf8(__A); +__m128i test_mm256_cvtsph_bf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256( + return _mm256_cvtsph_bf8(__A); } -__m128i test_mm256_mask_cvtnesph_pbf8(__m128i __A, __mmask16 __B, __m256h __C) { - // CHECK-LABEL: @test_mm256_mask_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( - return _mm256_mask_cvtnesph_pbf8(__A, __B, __C); +__m128i test_mm256_mask_cvtsph_bf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256( + return _mm256_mask_cvtsph_bf8(__A, __B, __C); } -__m128i test_mm256_maskz_cvtnesph_pbf8(__mmask16 __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtnesph_pbf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( - return _mm256_maskz_cvtnesph_pbf8(__A, __B); +__m128i test_mm256_maskz_cvtsph_bf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtsph_bf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256( + return _mm256_maskz_cvtsph_bf8(__A, __B); } 
-__m128i test_mm_cvtneph_phf8(__m128h __A) { - // CHECK-LABEL: @test_mm_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128( - return _mm_cvtneph_phf8(__A); +__m128i test_mm_cvtph_hf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128( + return _mm_cvtph_hf8(__A); } -__m128i test_mm_mask_cvtneph_phf8(__m128i __A, __mmask8 __B, __m128h __C) { - // CHECK-LABEL: @test_mm_mask_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128( - return _mm_mask_cvtneph_phf8(__A, __B, __C); +__m128i test_mm_mask_cvtph_hf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128( + return _mm_mask_cvtph_hf8(__A, __B, __C); } -__m128i test_mm_maskz_cvtneph_phf8(__mmask8 __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128( - return _mm_maskz_cvtneph_phf8(__A, __B); +__m128i test_mm_maskz_cvtph_hf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128( + return _mm_maskz_cvtph_hf8(__A, __B); } -__m128i test_mm256_cvtneph_phf8(__m256h __A) { - // CHECK-LABEL: @test_mm256_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( - return _mm256_cvtneph_phf8(__A); +__m128i test_mm256_cvtph_hf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256( + return _mm256_cvtph_hf8(__A); } -__m128i test_mm256_mask_cvtneph_phf8(__m128i __A, __mmask16 __B, __m256h __C) { - // CHECK-LABEL: @test_mm256_mask_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( - return _mm256_mask_cvtneph_phf8(__A, __B, __C); +__m128i test_mm256_mask_cvtph_hf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtph_hf8( + // 
CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256( + return _mm256_mask_cvtph_hf8(__A, __B, __C); } -__m128i test_mm256_maskz_cvtneph_phf8(__mmask16 __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtneph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( - return _mm256_maskz_cvtneph_phf8(__A, __B); +__m128i test_mm256_maskz_cvtph_hf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256( + return _mm256_maskz_cvtph_hf8(__A, __B); } -__m128i test_mm_cvtnesph_phf8(__m128h __A) { - // CHECK-LABEL: @test_mm_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( - return _mm_cvtnesph_phf8(__A); +__m128i test_mm_cvtsph_hf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128( + return _mm_cvtsph_hf8(__A); } -__m128i test_mm_mask_cvtnesph_phf8(__m128i __A, __mmask8 __B, __m128h __C) { - // CHECK-LABEL: @test_mm_mask_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( - return _mm_mask_cvtnesph_phf8(__A, __B, __C); +__m128i test_mm_mask_cvtsph_hf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128( + return _mm_mask_cvtsph_hf8(__A, __B, __C); } -__m128i test_mm_maskz_cvtnesph_phf8(__mmask8 __A, __m128h __B) { - // CHECK-LABEL: @test_mm_maskz_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( - return _mm_maskz_cvtnesph_phf8(__A, __B); +__m128i test_mm_maskz_cvtsph_hf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128( + return _mm_maskz_cvtsph_hf8(__A, __B); } -__m128i test_mm256_cvtnesph_phf8(__m256h __A) { - // CHECK-LABEL: @test_mm256_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( - return 
_mm256_cvtnesph_phf8(__A); +__m128i test_mm256_cvtsph_hf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256( + return _mm256_cvtsph_hf8(__A); } -__m128i test_mm256_mask_cvtnesph_phf8(__m128i __A, __mmask16 __B, __m256h __C) { - // CHECK-LABEL: @test_mm256_mask_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( - return _mm256_mask_cvtnesph_phf8(__A, __B, __C); +__m128i test_mm256_mask_cvtsph_hf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256( + return _mm256_mask_cvtsph_hf8(__A, __B, __C); } -__m128i test_mm256_maskz_cvtnesph_phf8(__mmask16 __A, __m256h __B) { - // CHECK-LABEL: @test_mm256_maskz_cvtnesph_phf8( - // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( - return _mm256_maskz_cvtnesph_phf8(__A, __B); +__m128i test_mm256_maskz_cvtsph_hf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtsph_hf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256( + return _mm256_maskz_cvtsph_hf8(__A, __B); } -__m256h test_mm256_cvtpbf8_ph(__m128i A) { - // CHECK-LABEL: @test_mm256_cvtpbf8_ph +__m256h test_mm256_cvtbf8_ph(__m128i A) { + // CHECK-LABEL: @test_mm256_cvtbf8_ph // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> // CHECK: @llvm.x86.avx2.pslli.w // CHECK: ret <16 x half> %{{.*}} - return _mm256_cvtpbf8_ph(A); + return _mm256_cvtbf8_ph(A); } -__m256h test_mm256_mask_cvtpbf8_ph(__m256h S, __mmask16 M, __m128i A) { - // CHECK-LABEL: @test_mm256_mask_cvtpbf8_ph +__m256h test_mm256_mask_cvtbf8_ph(__m256h S, __mmask16 M, __m128i A) { + // CHECK-LABEL: @test_mm256_mask_cvtbf8_ph // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> // CHECK: @llvm.x86.avx2.pslli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} // CHECK: ret <16 x half> %{{.*}} - return _mm256_mask_cvtpbf8_ph(S, M, A); + return 
_mm256_mask_cvtbf8_ph(S, M, A); } -__m256h test_mm256_maskz_cvtpbf8_ph(__mmask16 M, __m128i A) { - // CHECK-LABEL: @test_mm256_maskz_cvtpbf8_ph +__m256h test_mm256_maskz_cvtbf8_ph(__mmask16 M, __m128i A) { + // CHECK-LABEL: @test_mm256_maskz_cvtbf8_ph // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} // CHECK: @llvm.x86.avx2.pslli.w // CHECK: ret <16 x half> %{{.*}} - return _mm256_maskz_cvtpbf8_ph(M, A); + return _mm256_maskz_cvtbf8_ph(M, A); } -__m128h test_mm_cvtpbf8_ph(__m128i A) { - // CHECK-LABEL: @test_mm_cvtpbf8_ph +__m128h test_mm_cvtbf8_ph(__m128i A) { + // CHECK-LABEL: @test_mm_cvtbf8_ph // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> // CHECK: @llvm.x86.sse2.pslli.w // CHECK: ret <8 x half> %{{.*}} - return _mm_cvtpbf8_ph(A); + return _mm_cvtbf8_ph(A); } -__m128h test_mm_mask_cvtpbf8_ph(__m128h S, __mmask8 M, __m128i A) { - // CHECK-LABEL: @test_mm_mask_cvtpbf8_ph +__m128h test_mm_mask_cvtbf8_ph(__m128h S, __mmask8 M, __m128i A) { + // CHECK-LABEL: @test_mm_mask_cvtbf8_ph // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> // CHECK: @llvm.x86.sse2.pslli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} // CHECK: ret <8 x half> %{{.*}} - return _mm_mask_cvtpbf8_ph(S, M, A); + return _mm_mask_cvtbf8_ph(S, M, A); } -__m128h test_mm_maskz_cvtpbf8_ph(__mmask8 M, __m128i A) { - // CHECK-LABEL: @test_mm_maskz_cvtpbf8_ph +__m128h test_mm_maskz_cvtbf8_ph(__mmask8 M, __m128i A) { + // CHECK-LABEL: @test_mm_maskz_cvtbf8_ph // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} // CHECK: @llvm.x86.sse2.pslli.w // CHECK: ret <8 x half> %{{.*}} - return _mm_maskz_cvtpbf8_ph(M, A); + return _mm_maskz_cvtbf8_ph(M, A); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 9b924bfea448b3..facc0f9fc8df55 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ 
b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -7503,40 +7503,40 @@ def int_x86_avx10_mask_vcvtbiasph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtbia def int_x86_avx10_mask_vcvtbiasph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8s_512_mask">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v64i8_ty, llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_128">, +def int_x86_avx10_vcvt2ph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8_128">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_256">, +def int_x86_avx10_vcvt2ph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8_256">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_512">, +def int_x86_avx10_vcvt2ph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8_512">, DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_128">, +def int_x86_avx10_vcvt2ph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8s_128">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_256">, +def int_x86_avx10_vcvt2ph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8s_256">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_512">, +def int_x86_avx10_vcvt2ph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvt2ph2bf8s_512">, DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8128 : 
ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_128">, +def int_x86_avx10_vcvt2ph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8_128">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_256">, +def int_x86_avx10_vcvt2ph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8_256">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_512">, +def int_x86_avx10_vcvt2ph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8_512">, DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_128">, +def int_x86_avx10_vcvt2ph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8s_128">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_256">, +def int_x86_avx10_vcvt2ph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8s_256">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], [IntrNoMem]>; -def int_x86_avx10_vcvtne2ph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_512">, +def int_x86_avx10_vcvt2ph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvt2ph2hf8s_512">, DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], [IntrNoMem]>; def int_x86_avx10_mask_vcvthf82ph128 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph128_mask">, @@ -7548,40 +7548,40 @@ def int_x86_avx10_mask_vcvthf82ph256 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph2 def int_x86_avx10_mask_vcvthf82ph512 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph512_mask">, DefaultAttrsIntrinsic<[llvm_v32f16_ty], [llvm_v32i8_ty, llvm_v32f16_ty, llvm_i32_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8128 : 
ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_128_mask">, +def int_x86_avx10_mask_vcvtph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8_128_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_256_mask">, +def int_x86_avx10_mask_vcvtph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8_256_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_512_mask">, +def int_x86_avx10_mask_vcvtph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8_512_mask">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_128_mask">, +def int_x86_avx10_mask_vcvtph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8s_128_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_256_mask">, +def int_x86_avx10_mask_vcvtph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8s_256_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_512_mask">, +def int_x86_avx10_mask_vcvtph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtph2bf8s_512_mask">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_128_mask">, +def int_x86_avx10_mask_vcvtph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8_128_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; -def 
int_x86_avx10_mask_vcvtneph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_256_mask">, +def int_x86_avx10_mask_vcvtph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8_256_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_512_mask">, +def int_x86_avx10_mask_vcvtph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8_512_mask">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_128_mask">, +def int_x86_avx10_mask_vcvtph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8s_128_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_256_mask">, +def int_x86_avx10_mask_vcvtph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8s_256_mask">, DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; -def int_x86_avx10_mask_vcvtneph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_512_mask">, +def int_x86_avx10_mask_vcvtph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtph2hf8s_512_mask">, DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7a9be6f2af9b40..fad329e348af80 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34953,26 +34953,26 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVTTP2IUBS) NODE_NAME_CASE(CVTTP2IBS_SAE) NODE_NAME_CASE(CVTTP2IUBS_SAE) - NODE_NAME_CASE(VCVTNE2PH2BF8) - NODE_NAME_CASE(VCVTNE2PH2BF8S) - NODE_NAME_CASE(VCVTNE2PH2HF8) - NODE_NAME_CASE(VCVTNE2PH2HF8S) + 
NODE_NAME_CASE(VCVT2PH2BF8) + NODE_NAME_CASE(VCVT2PH2BF8S) + NODE_NAME_CASE(VCVT2PH2HF8) + NODE_NAME_CASE(VCVT2PH2HF8S) NODE_NAME_CASE(VCVTBIASPH2BF8) NODE_NAME_CASE(VCVTBIASPH2BF8S) NODE_NAME_CASE(VCVTBIASPH2HF8) NODE_NAME_CASE(VCVTBIASPH2HF8S) - NODE_NAME_CASE(VCVTNEPH2BF8) - NODE_NAME_CASE(VCVTNEPH2BF8S) - NODE_NAME_CASE(VCVTNEPH2HF8) - NODE_NAME_CASE(VCVTNEPH2HF8S) + NODE_NAME_CASE(VCVTPH2BF8) + NODE_NAME_CASE(VCVTPH2BF8S) + NODE_NAME_CASE(VCVTPH2HF8) + NODE_NAME_CASE(VCVTPH2HF8S) NODE_NAME_CASE(VMCVTBIASPH2BF8) NODE_NAME_CASE(VMCVTBIASPH2BF8S) NODE_NAME_CASE(VMCVTBIASPH2HF8) NODE_NAME_CASE(VMCVTBIASPH2HF8S) - NODE_NAME_CASE(VMCVTNEPH2BF8) - NODE_NAME_CASE(VMCVTNEPH2BF8S) - NODE_NAME_CASE(VMCVTNEPH2HF8) - NODE_NAME_CASE(VMCVTNEPH2HF8S) + NODE_NAME_CASE(VMCVTPH2BF8) + NODE_NAME_CASE(VMCVTPH2BF8S) + NODE_NAME_CASE(VMCVTPH2HF8) + NODE_NAME_CASE(VMCVTPH2HF8S) NODE_NAME_CASE(VCVTHF82PH) NODE_NAME_CASE(AESENC128KL) NODE_NAME_CASE(AESDEC128KL) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 03f10a3c83e30c..e07bcd989c5188 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -625,26 +625,26 @@ namespace llvm { MPSADBW, - VCVTNE2PH2BF8, - VCVTNE2PH2BF8S, - VCVTNE2PH2HF8, - VCVTNE2PH2HF8S, + VCVT2PH2BF8, + VCVT2PH2BF8S, + VCVT2PH2HF8, + VCVT2PH2HF8S, VCVTBIASPH2BF8, VCVTBIASPH2BF8S, VCVTBIASPH2HF8, VCVTBIASPH2HF8S, - VCVTNEPH2BF8, - VCVTNEPH2BF8S, - VCVTNEPH2HF8, - VCVTNEPH2HF8S, + VCVTPH2BF8, + VCVTPH2BF8S, + VCVTPH2HF8, + VCVTPH2HF8S, VMCVTBIASPH2BF8, VMCVTBIASPH2BF8S, VMCVTBIASPH2HF8, VMCVTBIASPH2HF8S, - VMCVTNEPH2BF8, - VMCVTNEPH2BF8S, - VMCVTNEPH2HF8, - VMCVTNEPH2HF8S, + VMCVTPH2BF8, + VMCVTPH2BF8S, + VMCVTPH2HF8, + VMCVTPH2HF8S, VCVTHF82PH, // Compress and expand. 
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 98874a3c9e6f20..c5ce8f20c065e7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1072,21 +1072,21 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx", avx512vl_f32_info, avx512vl_f16_info, X86vfpround2, X86vfpround2Rnd>, T8; -defm VCVTNE2PH2BF8 : avx512_binop_all<0x74, "vcvtne2ph2bf8", SchedWriteCvtPD2PS, +defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvtne2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T8, XD; -defm VCVTNE2PH2BF8S : avx512_binop_all<0x74, "vcvtne2ph2bf8s", SchedWriteCvtPD2PS, +defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvtne2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; -defm VCVTNE2PH2HF8 : avx512_binop_all<0x18, "vcvtne2ph2hf8", SchedWriteCvtPD2PS, +defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvtne2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; -defm VCVTNE2PH2HF8S : avx512_binop_all<0x1b, "vcvtne2ph2hf8s", SchedWriteCvtPD2PS, +defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvtne2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; //TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here. 
@@ -1244,27 +1244,27 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s", X86vcvtbiasph2hf8s, X86vmcvtbiasph2hf8s>, T_MAP5, PS; -defm VCVTNEPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtneph2bf8", avx512vl_i8_info, +defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, - X86vcvtneph2bf8, X86vmcvtneph2bf8, + X86vcvtph2bf8, X86vmcvtph2bf8, [HasAVX10_2], [HasAVX10_2_512]>, T8, XS, EVEX_CD8<16, CD8VF>; -defm VCVTNEPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtneph2bf8s", avx512vl_i8_info, +defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, - X86vcvtneph2bf8s, X86vmcvtneph2bf8s, + X86vcvtph2bf8s, X86vmcvtph2bf8s, [HasAVX10_2], [HasAVX10_2_512]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; -defm VCVTNEPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtneph2hf8", avx512vl_i8_info, +defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, - X86vcvtneph2hf8, X86vmcvtneph2hf8, + X86vcvtph2hf8, X86vmcvtph2hf8, [HasAVX10_2], [HasAVX10_2_512]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; -defm VCVTNEPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtneph2hf8s", avx512vl_i8_info, +defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, - X86vcvtneph2hf8s, X86vmcvtneph2hf8s, + X86vcvtph2hf8s, X86vmcvtph2hf8s, [HasAVX10_2], [HasAVX10_2_512]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; @@ -1332,7 +1332,7 @@ multiclass avx10_fp_binopne_int_pbf16 opc, string OpcodeStr, } } -multiclass avx10_fp_binop_pbf16 opc, string OpcodeStr, SDPatternOperator OpNode, +multiclass avx10_fp_binop_bf16 opc, string OpcodeStr, SDPatternOperator OpNode, X86SchedWriteSizes sched, bit IsCommutable = 0, SDPatternOperator MaskOpNode = OpNode> { @@ -1351,10 +1351,10 @@ multiclass avx10_fp_binop_pbf16 opc, string OpcodeStr, SDPatternOperator } let Uses = [], mayRaiseFPException = 0 in { -defm VADDNEPBF16 : 
avx10_fp_binop_pbf16<0x58, "vaddne", fadd, SchedWriteFAddSizes, 1>; -defm VSUBNEPBF16 : avx10_fp_binop_pbf16<0x5C, "vsubne", fsub, SchedWriteFAddSizes, 0>; -defm VMULNEPBF16 : avx10_fp_binop_pbf16<0x59, "vmulne", fmul, SchedWriteFMulSizes, 1>; -defm VDIVNEPBF16 : avx10_fp_binop_pbf16<0x5E, "vdivne", fdiv, SchedWriteFDivSizes, 0>; +defm VADDNEPBF16 : avx10_fp_binop_bf16<0x58, "vaddne", fadd, SchedWriteFAddSizes, 1>; +defm VSUBNEPBF16 : avx10_fp_binop_bf16<0x5C, "vsubne", fsub, SchedWriteFAddSizes, 0>; +defm VMULNEPBF16 : avx10_fp_binop_bf16<0x59, "vmulne", fmul, SchedWriteFMulSizes, 1>; +defm VDIVNEPBF16 : avx10_fp_binop_bf16<0x5E, "vdivne", fdiv, SchedWriteFDivSizes, 0>; defm VMINPBF16 : avx10_fp_binopne_int_pbf16<0x5D, "vmin", SchedWriteFCmpSizes, 0>; defm VMAXPBF16 : avx10_fp_binopne_int_pbf16<0x5F, "vmax", SchedWriteFCmpSizes, 0>; } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index af0267a7d32c3a..de70570481fc2b 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -914,13 +914,13 @@ def X86vfpround2Rnd : SDNode<"X86ISD::VFPROUND2_RND", SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>>; // 3op -def X86vcvtne2ph2bf8 : SDNode<"X86ISD::VCVTNE2PH2BF8", +def X86vcvt2ph2bf8 : SDNode<"X86ISD::VCVT2PH2BF8", SDTAVX10CONVERT_I82F16>; -def X86vcvtne2ph2bf8s : SDNode<"X86ISD::VCVTNE2PH2BF8S", +def X86vcvt2ph2bf8s : SDNode<"X86ISD::VCVT2PH2BF8S", SDTAVX10CONVERT_I82F16>; -def X86vcvtne2ph2hf8 : SDNode<"X86ISD::VCVTNE2PH2HF8", +def X86vcvt2ph2hf8 : SDNode<"X86ISD::VCVT2PH2HF8", SDTAVX10CONVERT_I82F16>; -def X86vcvtne2ph2hf8s : SDNode<"X86ISD::VCVTNE2PH2HF8S", +def X86vcvt2ph2hf8s : SDNode<"X86ISD::VCVT2PH2HF8S", SDTAVX10CONVERT_I82F16>; // 2op no broadcast def X86vcvthf82ph : SDNode<"X86ISD::VCVTHF82PH", @@ -934,13 +934,13 @@ def X86vcvtbiasph2hf8 : SDNode<"X86ISD::VCVTBIASPH2HF8", SDTAVX10CONVERT_2I8F16>; def X86vcvtbiasph2hf8s : SDNode<"X86ISD::VCVTBIASPH2HF8S", 
SDTAVX10CONVERT_2I8F16>; -def X86vcvtneph2bf8 : SDNode<"X86ISD::VCVTNEPH2BF8", +def X86vcvtph2bf8 : SDNode<"X86ISD::VCVTPH2BF8", SDTAVX10CONVERT_I8F16>; -def X86vcvtneph2bf8s : SDNode<"X86ISD::VCVTNEPH2BF8S", +def X86vcvtph2bf8s : SDNode<"X86ISD::VCVTPH2BF8S", SDTAVX10CONVERT_I8F16>; -def X86vcvtneph2hf8 : SDNode<"X86ISD::VCVTNEPH2HF8", +def X86vcvtph2hf8 : SDNode<"X86ISD::VCVTPH2HF8", SDTAVX10CONVERT_I8F16>; -def X86vcvtneph2hf8s : SDNode<"X86ISD::VCVTNEPH2HF8S", +def X86vcvtph2hf8s : SDNode<"X86ISD::VCVTPH2HF8S", SDTAVX10CONVERT_I8F16>; def X86vmcvtbiasph2bf8 : SDNode<"X86ISD::VMCVTBIASPH2BF8", @@ -951,13 +951,13 @@ def X86vmcvtbiasph2hf8 : SDNode<"X86ISD::VMCVTBIASPH2HF8", SDTAVX10CONVERT_2I8F16_MASK>; def X86vmcvtbiasph2hf8s : SDNode<"X86ISD::VMCVTBIASPH2HF8S", SDTAVX10CONVERT_2I8F16_MASK>; -def X86vmcvtneph2bf8 : SDNode<"X86ISD::VMCVTNEPH2BF8", +def X86vmcvtph2bf8 : SDNode<"X86ISD::VMCVTPH2BF8", SDTAVX10CONVERT_I8F16_MASK>; -def X86vmcvtneph2bf8s : SDNode<"X86ISD::VMCVTNEPH2BF8S", +def X86vmcvtph2bf8s : SDNode<"X86ISD::VMCVTPH2BF8S", SDTAVX10CONVERT_I8F16_MASK>; -def X86vmcvtneph2hf8 : SDNode<"X86ISD::VMCVTNEPH2HF8", +def X86vmcvtph2hf8 : SDNode<"X86ISD::VMCVTPH2HF8", SDTAVX10CONVERT_I8F16_MASK>; -def X86vmcvtneph2hf8s : SDNode<"X86ISD::VMCVTNEPH2HF8S", +def X86vmcvtph2hf8s : SDNode<"X86ISD::VMCVTPH2HF8S", SDTAVX10CONVERT_I8F16_MASK>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 467c7026bceb91..9e5db04fc0a4dd 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -93,7 +93,7 @@ struct IntrinsicData { }; #define X86_INTRINSIC_DATA(id, type, op0, op1) \ - { Intrinsic::x86_##id, type, op0, op1 } + {Intrinsic::x86_##id, type, op0, op1} /* * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in @@ -479,30 +479,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = 
{ X86ISD::VCVTHF82PH, 0), X86_INTRINSIC_DATA(avx10_mask_vcvthf82ph512, INTR_TYPE_1OP_MASK, X86ISD::VCVTHF82PH, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8128, TRUNCATE_TO_REG, - X86ISD::VCVTNEPH2BF8, X86ISD::VMCVTNEPH2BF8), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8256, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2BF8, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8512, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2BF8, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s128, TRUNCATE_TO_REG, - X86ISD::VCVTNEPH2BF8S, X86ISD::VMCVTNEPH2BF8S), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s256, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2BF8S, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s512, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2BF8S, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8128, TRUNCATE_TO_REG, - X86ISD::VCVTNEPH2HF8, X86ISD::VMCVTNEPH2HF8), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8256, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2HF8, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8512, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2HF8, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s128, TRUNCATE_TO_REG, - X86ISD::VCVTNEPH2HF8S, X86ISD::VMCVTNEPH2HF8S), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s256, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2HF8S, 0), - X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s512, INTR_TYPE_1OP_MASK, - X86ISD::VCVTNEPH2HF8S, 0), X86_INTRINSIC_DATA(avx10_mask_vcvtpd2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtpd2ph256, INTR_TYPE_1OP_MASK, @@ -515,8 +491,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtpd2uqq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8128, TRUNCATE_TO_REG, + X86ISD::VCVTPH2BF8, X86ISD::VMCVTPH2BF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8512, 
INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8s128, TRUNCATE_TO_REG, + X86ISD::VCVTPH2BF8S, X86ISD::VMCVTPH2BF8S), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8s256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2bf8s512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2BF8S, 0), X86_INTRINSIC_DATA(avx10_mask_vcvtph2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8128, TRUNCATE_TO_REG, + X86ISD::VCVTPH2HF8, X86ISD::VMCVTPH2HF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8s128, TRUNCATE_TO_REG, + X86ISD::VCVTPH2HF8S, X86ISD::VMCVTPH2HF8S), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8s256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2hf8s512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTPH2HF8S, 0), X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs128, INTR_TYPE_1OP_MASK, X86ISD::CVTP2IBS, 0), X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs256, INTR_TYPE_1OP_MASK, @@ -757,6 +757,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx10_vcomsbf16le, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(avx10_vcomsbf16lt, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(avx10_vcomsbf16neq, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8128, INTR_TYPE_2OP, X86ISD::VCVT2PH2BF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8256, INTR_TYPE_2OP, X86ISD::VCVT2PH2BF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8512, INTR_TYPE_2OP, X86ISD::VCVT2PH2BF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8s128, INTR_TYPE_2OP, + X86ISD::VCVT2PH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8s256, INTR_TYPE_2OP, + X86ISD::VCVT2PH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2bf8s512, INTR_TYPE_2OP, + X86ISD::VCVT2PH2BF8S, 
0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8128, INTR_TYPE_2OP, X86ISD::VCVT2PH2HF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8256, INTR_TYPE_2OP, X86ISD::VCVT2PH2HF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8512, INTR_TYPE_2OP, X86ISD::VCVT2PH2HF8, + 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8s128, INTR_TYPE_2OP, + X86ISD::VCVT2PH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8s256, INTR_TYPE_2OP, + X86ISD::VCVT2PH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvt2ph2hf8s512, INTR_TYPE_2OP, + X86ISD::VCVT2PH2HF8S, 0), X86_INTRINSIC_DATA(avx10_vcvtbf162ibs128, INTR_TYPE_1OP, X86ISD::CVTP2IBS, 0), X86_INTRINSIC_DATA(avx10_vcvtbf162ibs256, INTR_TYPE_1OP, X86ISD::CVTP2IBS, @@ -769,30 +793,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 0), X86_INTRINSIC_DATA(avx10_vcvtbf162iubs512, INTR_TYPE_1OP, X86ISD::CVTP2IUBS, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8128, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8256, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8512, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s128, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8S, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s256, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8S, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s512, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2BF8S, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8128, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8256, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8512, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s128, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8S, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s256, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8S, 0), - X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s512, INTR_TYPE_2OP, - X86ISD::VCVTNE2PH2HF8S, 0), X86_INTRINSIC_DATA(avx10_vcvttbf162ibs128, INTR_TYPE_1OP, X86ISD::CVTTP2IBS, 0), 
X86_INTRINSIC_DATA(avx10_vcvttbf162ibs256, INTR_TYPE_1OP, X86ISD::CVTTP2IBS, diff --git a/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll index e755b56f30d4c0..c4a904cc3bc416 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll @@ -258,28 +258,28 @@ define <32 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8s512(<64 x i8> %A, <32 ret <32 x i8> %ret } -define <64 x i8> @test_int_x86_avx10_vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512: +define <64 x i8> @test_int_x86_avx10_vcvt2ph2bf8512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2bf8512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + %ret = call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512(<32 x half> %A, <32 x half> %B) ret <64 x i8> %ret } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_mask: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2bf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8512_mask: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_mask: +; X86-LABEL: 
test_int_x86_avx10_vcvt2ph2bf8512_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512(<32 x half> %A, <32 x half> %B) %2 = bitcast <8 x i64> %C to <64 x i8> %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 @@ -287,51 +287,51 @@ define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8512_mask(<8 x i64> %C, i64 %U, ret <8 x i64> %5 } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_maskz: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2bf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8512_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x74,0xc1] +; X64-NEXT: vcvt2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8512_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x74,0xc1] +; X86-NEXT: vcvt2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + %1 = call 
<64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512(<32 x half> %A, <32 x half> %B) %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer %5 = bitcast <64 x i8> %4 to <8 x i64> ret <8 x i64> %5 } -declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) +declare <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8512(<32 x half> %A, <32 x half> %B) -define <64 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512: +define <64 x i8> @test_int_x86_avx10_vcvt2ph2bf8s512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2bf8s512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + %ret = call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512(<32 x half> %A, <32 x half> %B) ret <64 x i8> %ret } -declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) +declare <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512(<32 x half> %A, <32 x half> %B) -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_mask: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2bf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s512_mask: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; 
X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s512_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512(<32 x half> %A, <32 x half> %B) %2 = bitcast <8 x i64> %C to <64 x i8> %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 @@ -339,47 +339,47 @@ define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8s512_mask(<8 x i64> %C, i64 %U ret <8 x i64> %5 } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_maskz: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2bf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s512_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] +; X64-NEXT: vcvt2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s512_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] +; X86-NEXT: vcvt2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 
x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2bf8s512(<32 x half> %A, <32 x half> %B) %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer %5 = bitcast <64 x i8> %4 to <8 x i64> ret <8 x i64> %5 } -define <64 x i8> @test_int_x86_avx10_vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512: +define <64 x i8> @test_int_x86_avx10_vcvt2ph2hf8512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x18,0xc1] +; CHECK-NEXT: vcvt2ph2hf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x18,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + %ret = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512(<32 x half> %A, <32 x half> %B) ret <64 x i8> %ret } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_mask: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2hf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8512_mask: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] +; X64-NEXT: vcvt2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8512_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # 
encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] +; X86-NEXT: vcvt2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512(<32 x half> %A, <32 x half> %B) %2 = bitcast <8 x i64> %C to <64 x i8> %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 @@ -387,49 +387,49 @@ define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8512_mask(<8 x i64> %C, i64 %U, ret <8 x i64> %5 } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_maskz: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2hf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8512_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] +; X64-NEXT: vcvt2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8512_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] +; X86-NEXT: vcvt2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512(<32 x half> %A, <32 x half> %B) %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer %5 = bitcast <64 x i8> %4 to 
<8 x i64> ret <8 x i64> %5 } -declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) +declare <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8512(<32 x half> %A, <32 x half> %B) -define <64 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512: +define <64 x i8> @test_int_x86_avx10_vcvt2ph2hf8s512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8s512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x1b,0xc1] +; CHECK-NEXT: vcvt2ph2hf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x1b,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + %ret = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512(<32 x half> %A, <32 x half> %B) ret <64 x i8> %ret } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_mask: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2hf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s512_mask: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] +; X64-NEXT: vcvt2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s512_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] +; X86-NEXT: vcvt2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # 
encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512(<32 x half> %A, <32 x half> %B) %2 = bitcast <8 x i64> %C to <64 x i8> %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 @@ -438,26 +438,26 @@ define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8s512_mask(<8 x i64> %C, i64 %U } -define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_maskz: +define <8 x i64> @test_int_x86_avx10_vcvt2ph2hf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s512_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] +; X64-NEXT: vcvt2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s512_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] +; X86-NEXT: vcvt2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + %1 = call <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512(<32 x half> %A, <32 x half> %B) %3 = bitcast i64 %U to <64 x i1> %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer %5 = bitcast <64 x i8> %4 to <8 x i64> ret <8 x i64> %5 } -declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, 
<32 x half> %B) +declare <64 x i8> @llvm.x86.avx10.vcvt2ph2hf8s512(<32 x half> %A, <32 x half> %B) define <32 x half> @test_int_x86_avx10_vcvthf82ph512(<32 x i8> %A) nounwind { ; CHECK-LABEL: test_int_x86_avx10_vcvthf82ph512: @@ -504,174 +504,174 @@ define <32 x half> @test_int_x86_avx10_maskz_vcvthf82ph512(<32 x i8> %A, i32 %B) ret <32 x half> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtneph2bf8512(<32 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8512: +define <32 x i8> @test_int_x86_avx10_vcvtph2bf8512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8 %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8 %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x74,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> undef, i32 -1) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512(<32 x half> %A, <32 x i8> undef, i32 -1) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8512: +define <32 x i8> @test_int_x86_avx10_mask_vcvtph2bf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] +; X64-NEXT: vcvtph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8512: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] +; X86-NEXT: 
vcvtph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) ret <32 x i8> %ret } -declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) +declare <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) -define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8512(<32 x half> %A, i32 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8512: +define <32 x i8> @test_int_x86_avx10_maskz_vcvtph2bf8512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] +; X64-NEXT: vcvtph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8512: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] +; X86-NEXT: vcvtph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtneph2bf8s512(<32 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s512: +define <32 x i8> @test_int_x86_avx10_vcvtph2bf8s512(<32 x half> %A) 
nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8s512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x74,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s512: +define <32 x i8> @test_int_x86_avx10_mask_vcvtph2bf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] +; X64-NEXT: vcvtph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s512: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] +; X86-NEXT: vcvtph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) ret <32 x i8> %ret } -declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) +declare <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) -define <32 x i8> 
@test_int_x86_avx10_maskz_vcvtneph2bf8s512(<32 x half> %A, i32 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s512: +define <32 x i8> @test_int_x86_avx10_maskz_vcvtph2bf8s512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] +; X64-NEXT: vcvtph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s512: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] +; X86-NEXT: vcvtph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtneph2hf8512(<32 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8512: +define <32 x i8> @test_int_x86_avx10_vcvtph2hf8512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8 %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x18,0xc0] +; CHECK-NEXT: vcvtph2hf8 %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x18,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> undef, i32 -1) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512(<32 x half> %A, <32 x i8> undef, i32 -1) ret <32 x i8> %ret } -define <32 
x i8> @test_int_x86_avx10_mask_vcvtneph2hf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8512: +define <32 x i8> @test_int_x86_avx10_mask_vcvtph2hf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] +; X64-NEXT: vcvtph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8512: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] +; X86-NEXT: vcvtph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) ret <32 x i8> %ret } -declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) +declare <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) -define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8512(<32 x half> %A, i32 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8512: +define <32 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] +; X64-NEXT: vcvtph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] ; 
X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8512: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] +; X86-NEXT: vcvtph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtneph2hf8s512(<32 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s512: +define <32 x i8> @test_int_x86_avx10_vcvtph2hf8s512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8s512: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xc0] +; CHECK-NEXT: vcvtph2hf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s512: +define <32 x i8> @test_int_x86_avx10_mask_vcvtph2hf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x1b,0xc1] +; X64-NEXT: vcvtph2hf8s %zmm1, %ymm0 {%k1} # encoding: 
[0x62,0xf5,0x7e,0x49,0x1b,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s512: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x1b,0xc1] +; X86-NEXT: vcvtph2hf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x1b,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) ret <32 x i8> %ret } -declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) +declare <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) -define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s512(<32 x half> %A, i32 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s512: +define <32 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8s512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] +; X64-NEXT: vcvtph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s512: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] +; X86-NEXT: vcvtph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <32 x i8> 
@llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) ret <32 x i8> %ret } diff --git a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll index fc74f0b490cd85..fe2bfb7b446914 100644 --- a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll @@ -493,381 +493,381 @@ define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8s256(<32 x i8> %A, <16 ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2bf8128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + %ret = call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128(<8 x half> %A, <8 x half> %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_mask: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8128_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] ; X64-NEXT: retq # 
encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8128_mask: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C ret <16 x i8> %3 } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128_maskz(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_maskz: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8128_maskz(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8128_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8128_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128(<8 x 
half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer ret <16 x i8> %3 } -declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) +declare <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8128(<8 x half> %A, <8 x half> %B) -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2bf8256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + %ret = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(<16 x half> %A, <16 x half> %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_mask: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8256_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8256_mask: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf2,0x77,0x29,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C ret <32 x i8> %3 } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256_maskz(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_maskz: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8256_maskz(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8256_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8256_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer ret <32 x i8> %3 } -declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) +declare <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(<16 
x half> %A, <16 x half> %B) -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8s128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2bf8s128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + %ret = call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128(<8 x half> %A, <8 x half> %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_mask: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s128_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s128_mask: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> 
@llvm.x86.avx10.vcvt2ph2bf8s128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C ret <16 x i8> %3 } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_maskz: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2bf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s128_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] +; X64-NEXT: vcvt2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s128_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] +; X86-NEXT: vcvt2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer ret <16 x i8> %3 } -declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) +declare <16 x i8> @llvm.x86.avx10.vcvt2ph2bf8s128(<8 x half> %A, <8 x half> %B) -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8s256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: 
test_int_x86_avx10_vcvt2ph2bf8s256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x74,0xc1] +; CHECK-NEXT: vcvt2ph2bf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x74,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + %ret = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(<16 x half> %A, <16 x half> %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_mask: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s256_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] +; X64-NEXT: vcvt2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s256_mask: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] +; X86-NEXT: vcvt2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C ret <32 x i8> %3 } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; 
X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_maskz: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2bf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2bf8s256_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] +; X64-NEXT: vcvt2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2bf8s256_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] +; X86-NEXT: vcvt2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer ret <32 x i8> %3 } -declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) +declare <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(<16 x half> %A, <16 x half> %B) -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x18,0xc1] +; CHECK-NEXT: vcvt2ph2hf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x18,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] 
- %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + %ret = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128(<8 x half> %A, <8 x half> %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_mask: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8128_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] +; X64-NEXT: vcvt2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8128_mask: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] +; X86-NEXT: vcvt2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C ret <16 x i8> %3 } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_maskz: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8128_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: 
vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] +; X64-NEXT: vcvt2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8128_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] +; X86-NEXT: vcvt2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer ret <16 x i8> %3 } -declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) +declare <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8128(<8 x half> %A, <8 x half> %B) -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x18,0xc1] +; CHECK-NEXT: vcvt2ph2hf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x18,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + %ret = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(<16 x half> %A, <16 x half> %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; 
X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_mask: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8256_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] +; X64-NEXT: vcvt2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8256_mask: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] +; X86-NEXT: vcvt2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C ret <32 x i8> %3 } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_maskz: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8256_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] +; X64-NEXT: vcvt2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_maskz: +; X86-LABEL: 
test_int_x86_avx10_vcvt2ph2hf8256_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] +; X86-NEXT: vcvt2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer ret <32 x i8> %3 } -declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) +declare <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(<16 x half> %A, <16 x half> %B) -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8s128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8s128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x1b,0xc1] +; CHECK-NEXT: vcvt2ph2hf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x1b,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + %ret = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128(<8 x half> %A, <8 x half> %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_mask: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s128_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] +; X64-NEXT: vcvt2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s128_mask: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] +; X86-NEXT: vcvt2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C ret <16 x i8> %3 } -define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_maskz: +define <16 x i8> @test_int_x86_avx10_vcvt2ph2hf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s128_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] +; X64-NEXT: vcvt2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s128_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] +; X86-NEXT: vcvt2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # 
encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + %1 = call <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128(<8 x half> %A, <8 x half> %B) %2 = bitcast i16 %U to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer ret <16 x i8> %3 } -declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) +declare <16 x i8> @llvm.x86.avx10.vcvt2ph2hf8s128(<8 x half> %A, <8 x half> %B) -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8s256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ph2hf8s256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x1b,0xc1] +; CHECK-NEXT: vcvt2ph2hf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x1b,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + %ret = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(<16 x half> %A, <16 x half> %B) ret <32 x i8> %ret } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_mask: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s256_mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] +; X64-NEXT: vcvt2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: 
test_int_x86_avx10_vcvtne2ph2hf8s256_mask: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s256_mask: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] +; X86-NEXT: vcvt2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(<16 x half> %A, <16 x half> %B) %2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C ret <32 x i8> %3 } -define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { -; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_maskz: +define <32 x i8> @test_int_x86_avx10_vcvt2ph2hf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvt2ph2hf8s256_maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] +; X64-NEXT: vcvt2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_maskz: +; X86-LABEL: test_int_x86_avx10_vcvt2ph2hf8s256_maskz: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] +; X86-NEXT: vcvt2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + %1 = call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(<16 x half> %A, <16 x half> %B) 
%2 = bitcast i32 %U to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer ret <32 x i8> %3 } -declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) +declare <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(<16 x half> %A, <16 x half> %B) define <8 x half> @test_int_x86_avx10_vcvthf82ph128(<16 x i8> %A) nounwind { ; CHECK-LABEL: test_int_x86_avx10_vcvthf82ph128: @@ -959,366 +959,366 @@ define <16 x half> @test_int_x86_avx10_maskz_vcvthf82ph256(<16 x i8> %A, i16 %B) ret <16 x half> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8128(<8 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8128: +define <16 x i8> @test_int_x86_avx10_vcvtph2bf8128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x74,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> undef, i8 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128(<8 x half> %A, <16 x i8> undef, i8 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8128: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2bf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] +; X64-NEXT: vcvtph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8128: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8128: ; X86: # %bb.0: ; X86-NEXT: 
kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] +; X86-NEXT: vcvtph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8128(<8 x half> %A, i8 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8128: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2bf8128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] +; X64-NEXT: vcvtph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8128: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] +; X86-NEXT: vcvtph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8256(<16 
x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8256: +define <16 x i8> @test_int_x86_avx10_vcvtph2bf8256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x74,0xc0] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> undef, i16 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256(<16 x half> %A, <16 x i8> undef, i16 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8256: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2bf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] +; X64-NEXT: vcvtph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8256: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] +; X86-NEXT: vcvtph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + %ret = call <16 x i8> 
@llvm.x86.avx10.mask.vcvtph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8256(<16 x half> %A, i16 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8256: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2bf8256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] +; X64-NEXT: vcvtph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8256: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] +; X86-NEXT: vcvtph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8s128(<8 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s128: +define <16 x i8> @test_int_x86_avx10_vcvtph2bf8s128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8s128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8s %xmm0, %xmm0 # encoding: 
[0x62,0xf5,0x7e,0x08,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8s %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x74,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s128: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2bf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] +; X64-NEXT: vcvtph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s128: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] +; X86-NEXT: vcvtph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8s128(<8 x half> %A, i8 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s128: +define <16 x i8> 
@test_int_x86_avx10_maskz_vcvtph2bf8s128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] +; X64-NEXT: vcvtph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s128: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] +; X86-NEXT: vcvtph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8s256(<16 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s256: +define <16 x i8> @test_int_x86_avx10_vcvtph2bf8s256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2bf8s256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2bf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x74,0xc0] +; CHECK-NEXT: vcvtph2bf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x74,0xc0] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { -; 
X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s256: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2bf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] +; X64-NEXT: vcvtph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s256: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2bf8s256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] +; X86-NEXT: vcvtph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8s256(<16 x half> %A, i16 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s256: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2bf8s256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x74,0xc0] +; X64-NEXT: vcvtph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: 
[0x62,0xf5,0x7e,0xa9,0x74,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s256: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2bf8s256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x74,0xc0] +; X86-NEXT: vcvtph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x74,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2bf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8128(<8 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8128: +define <16 x i8> @test_int_x86_avx10_vcvtph2hf8128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x18,0xc0] +; CHECK-NEXT: vcvtph2hf8 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x18,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> undef, i8 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128(<8 x half> %A, <16 x i8> undef, i8 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8128: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2hf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %xmm1, %xmm0 
{%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] +; X64-NEXT: vcvtph2hf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8128: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] +; X86-NEXT: vcvtph2hf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8128(<8 x half> %A, i8 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8128: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] +; X64-NEXT: vcvtph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8128: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] +; X86-NEXT: vcvtph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] ; X86-NEXT: retl # 
encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8256(<16 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8256: +define <16 x i8> @test_int_x86_avx10_vcvtph2hf8256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8 %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x18,0xc0] +; CHECK-NEXT: vcvtph2hf8 %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x18,0xc0] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> undef, i16 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256(<16 x half> %A, <16 x i8> undef, i16 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8256: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2hf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x18,0xc1] +; X64-NEXT: vcvtph2hf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x18,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8256: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x18,0xc1] +; 
X86-NEXT: vcvtph2hf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x18,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8256(<16 x half> %A, i16 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8256: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] +; X64-NEXT: vcvtph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8256: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] +; X86-NEXT: vcvtph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) ret <16 x i8> %ret } -define <16 x i8> 
@test_int_x86_avx10_vcvtneph2hf8s128(<8 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s128: +define <16 x i8> @test_int_x86_avx10_vcvtph2hf8s128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8s128: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8s %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xc0] +; CHECK-NEXT: vcvtph2hf8s %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s128: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2hf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] +; X64-NEXT: vcvtph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s128: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] +; X86-NEXT: vcvtph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) ret <16 x i8> %ret } -declare <16 x i8> 
@llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s128(<8 x half> %A, i8 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s128: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8s128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] +; X64-NEXT: vcvtph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s128: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] +; X86-NEXT: vcvtph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8s256(<16 x half> %A) nounwind { -; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s256: +define <16 x i8> @test_int_x86_avx10_vcvtph2hf8s256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtph2hf8s256: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtneph2hf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xc0] +; CHECK-NEXT: vcvtph2hf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xc0] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - 
%ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) ret <16 x i8> %ret } -define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { -; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s256: +define <16 x i8> @test_int_x86_avx10_mask_vcvtph2hf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] +; X64-NEXT: vcvtph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s256: +; X86-LABEL: test_int_x86_avx10_mask_vcvtph2hf8s256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] +; X86-NEXT: vcvtph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) ret <16 x i8> %ret } -declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) +declare <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) -define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s256(<16 x half> %A, i16 %B) nounwind { -; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s256: +define <16 x i8> @test_int_x86_avx10_maskz_vcvtph2hf8s256(<16 x 
half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] +; X64-NEXT: vcvtph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s256: +; X86-LABEL: test_int_x86_avx10_maskz_vcvtph2hf8s256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] +; X86-NEXT: vcvtph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] - %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtph2hf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) ret <16 x i8> %ret } diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt index 71506201cffe83..3b66fa1da52759 100644 --- a/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt +++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt @@ -657,835 +657,835 @@ # INTEL: vcvthf82ph zmm2 {k7} {z}, ymmword ptr [edx - 4096] 0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80 -# ATT: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymm4 +# ATT: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8 ymm2, ymm3, ymm4 0x62,0xf2,0x67,0x28,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 +# ATT: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2bf8 ymm2 {k7}, ymm3, ymm4 0x62,0xf2,0x67,0x2f,0x74,0xd4 -# ATT: vcvtne2ph2bf8 
%ymm4, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 +# ATT: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 0x62,0xf2,0x67,0xaf,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmm4 +# ATT: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8 zmm2, zmm3, zmm4 0x62,0xf2,0x67,0x48,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 +# ATT: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2bf8 zmm2 {k7}, zmm3, zmm4 0x62,0xf2,0x67,0x4f,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 +# ATT: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 0x62,0xf2,0x67,0xcf,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmm4 +# ATT: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8 xmm2, xmm3, xmm4 0x62,0xf2,0x67,0x08,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 +# ATT: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2bf8 xmm2 {k7}, xmm3, xmm4 0x62,0xf2,0x67,0x0f,0x74,0xd4 -# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +# ATT: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 0x62,0xf2,0x67,0x8f,0x74,0xd4 -# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: 
vcvt2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] 0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} +# ATT: vcvt2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} 0x62,0xf2,0x67,0x58,0x74,0x10 -# ATT: vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +# ATT: vcvt2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] 0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +# ATT: vcvt2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] 0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +# ATT: vcvt2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} 0x62,0xf2,0x67,0xdf,0x74,0x52,0x80 -# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] 0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} +# ATT: vcvt2ph2bf8 
(%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} 0x62,0xf2,0x67,0x38,0x74,0x10 -# ATT: vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +# ATT: vcvt2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] 0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +# ATT: vcvt2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] 0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +# ATT: vcvt2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} 0x62,0xf2,0x67,0xbf,0x74,0x52,0x80 -# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +# ATT: vcvt2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} 0x62,0xf2,0x67,0x18,0x74,0x10 -# ATT: vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +# ATT: vcvt2ph2bf8 -512(,%ebp,2), 
%xmm3, %xmm2 +# INTEL: vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] 0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +# ATT: vcvt2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] 0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +# ATT: vcvt2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} 0x62,0xf2,0x67,0x9f,0x74,0x52,0x80 -# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymm4 +# ATT: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8s ymm2, ymm3, ymm4 0x62,0xf5,0x67,0x28,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 +# ATT: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2bf8s ymm2 {k7}, ymm3, ymm4 0x62,0xf5,0x67,0x2f,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 +# ATT: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 0x62,0xf5,0x67,0xaf,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmm4 +# ATT: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8s zmm2, zmm3, zmm4 0x62,0xf5,0x67,0x48,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 +# ATT: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2bf8s zmm2 {k7}, zmm3, zmm4 0x62,0xf5,0x67,0x4f,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 +# ATT: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 
0x62,0xf5,0x67,0xcf,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmm4 +# ATT: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8s xmm2, xmm3, xmm4 0x62,0xf5,0x67,0x08,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 +# ATT: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2bf8s xmm2 {k7}, xmm3, xmm4 0x62,0xf5,0x67,0x0f,0x74,0xd4 -# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +# ATT: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 0x62,0xf5,0x67,0x8f,0x74,0xd4 -# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} +# ATT: vcvt2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} 0x62,0xf5,0x67,0x58,0x74,0x10 -# ATT: vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 -# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +# ATT: vcvt2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +# ATT: vcvt2ph2bf8s 
8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] 0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +# ATT: vcvt2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} 0x62,0xf5,0x67,0xdf,0x74,0x52,0x80 -# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} +# ATT: vcvt2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} 0x62,0xf5,0x67,0x38,0x74,0x10 -# ATT: vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 -# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +# ATT: vcvt2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvt2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] 0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +# ATT: vcvt2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] 0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s 
ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +# ATT: vcvt2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} 0x62,0xf5,0x67,0xbf,0x74,0x52,0x80 -# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +# ATT: vcvt2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} 0x62,0xf5,0x67,0x18,0x74,0x10 -# ATT: vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 -# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +# ATT: vcvt2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +# ATT: vcvt2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] 0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f -# ATT: vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +# ATT: vcvt2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} 0x62,0xf5,0x67,0x9f,0x74,0x52,0x80 -# ATT: vcvtne2ph2hf8 
%ymm4, %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymm4 +# ATT: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8 ymm2, ymm3, ymm4 0x62,0xf5,0x67,0x28,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 +# ATT: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2hf8 ymm2 {k7}, ymm3, ymm4 0x62,0xf5,0x67,0x2f,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 +# ATT: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 0x62,0xf5,0x67,0xaf,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmm4 +# ATT: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8 zmm2, zmm3, zmm4 0x62,0xf5,0x67,0x48,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 +# ATT: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2hf8 zmm2 {k7}, zmm3, zmm4 0x62,0xf5,0x67,0x4f,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 +# ATT: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 0x62,0xf5,0x67,0xcf,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmm4 +# ATT: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8 xmm2, xmm3, xmm4 0x62,0xf5,0x67,0x08,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmm4 +# ATT: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2hf8 xmm2 {k7}, xmm3, xmm4 0x62,0xf5,0x67,0x0f,0x18,0xd4 -# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +# ATT: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 0x62,0xf5,0x67,0x8f,0x18,0xd4 -# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 
8*esi + 268435456] +# ATT: vcvt2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} +# ATT: vcvt2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} 0x62,0xf5,0x67,0x58,0x18,0x10 -# ATT: vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +# ATT: vcvt2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +# ATT: vcvt2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] 0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f -# ATT: vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +# ATT: vcvt2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} 0x62,0xf5,0x67,0xdf,0x18,0x52,0x80 -# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 
{%k7} -# INTEL: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} +# ATT: vcvt2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} 0x62,0xf5,0x67,0x38,0x18,0x10 -# ATT: vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +# ATT: vcvt2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] 0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +# ATT: vcvt2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] 0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f -# ATT: vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +# ATT: vcvt2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} 0x62,0xf5,0x67,0xbf,0x18,0x52,0x80 -# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 
0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +# ATT: vcvt2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} 0x62,0xf5,0x67,0x18,0x18,0x10 -# ATT: vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +# ATT: vcvt2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +# ATT: vcvt2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] 0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f -# ATT: vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +# ATT: vcvt2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} 0x62,0xf5,0x67,0x9f,0x18,0x52,0x80 -# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymm4 +# ATT: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8s ymm2, ymm3, ymm4 0x62,0xf5,0x67,0x28,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 +# ATT: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2hf8s ymm2 {k7}, ymm3, ymm4 0x62,0xf5,0x67,0x2f,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 +# ATT: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 0x62,0xf5,0x67,0xaf,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmm4 +# ATT: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8s zmm2, zmm3, zmm4 0x62,0xf5,0x67,0x48,0x1b,0xd4 
-# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 +# ATT: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2hf8s zmm2 {k7}, zmm3, zmm4 0x62,0xf5,0x67,0x4f,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 +# ATT: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 0x62,0xf5,0x67,0xcf,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmm4 +# ATT: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8s xmm2, xmm3, xmm4 0x62,0xf5,0x67,0x08,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 +# ATT: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2hf8s xmm2 {k7}, xmm3, xmm4 0x62,0xf5,0x67,0x0f,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +# ATT: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 0x62,0xf5,0x67,0x8f,0x1b,0xd4 -# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -# INTEL: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} +# ATT: vcvt2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} 0x62,0xf5,0x67,0x58,0x1b,0x10 -# ATT: 
vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 -# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +# ATT: vcvt2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +# ATT: vcvt2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] 0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f -# ATT: vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +# ATT: vcvt2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} 0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80 -# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -# INTEL: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} +# ATT: vcvt2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} 0x62,0xf5,0x67,0x38,0x1b,0x10 -# ATT: vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 -# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +# ATT: vcvt2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] 
0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +# ATT: vcvt2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] 0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f -# ATT: vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +# ATT: vcvt2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} 0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80 -# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -# INTEL: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvt2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +# ATT: vcvt2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} 0x62,0xf5,0x67,0x18,0x1b,0x10 -# ATT: vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 -# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +# ATT: vcvt2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +# ATT: vcvt2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} 
+# INTEL: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] 0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f -# ATT: vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +# ATT: vcvt2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} 0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80 -# ATT: vcvtneph2bf8 %xmm3, %xmm2 -# INTEL: vcvtneph2bf8 xmm2, xmm3 +# ATT: vcvtph2bf8 %xmm3, %xmm2 +# INTEL: vcvtph2bf8 xmm2, xmm3 0x62,0xf2,0x7e,0x08,0x74,0xd3 -# ATT: vcvtneph2bf8 %xmm3, %xmm2 {%k7} -# INTEL: vcvtneph2bf8 xmm2 {k7}, xmm3 +# ATT: vcvtph2bf8 %xmm3, %xmm2 {%k7} +# INTEL: vcvtph2bf8 xmm2 {k7}, xmm3 0x62,0xf2,0x7e,0x0f,0x74,0xd3 -# ATT: vcvtneph2bf8 %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, xmm3 +# ATT: vcvtph2bf8 %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 xmm2 {k7} {z}, xmm3 0x62,0xf2,0x7e,0x8f,0x74,0xd3 -# ATT: vcvtneph2bf8 %zmm3, %ymm2 -# INTEL: vcvtneph2bf8 ymm2, zmm3 +# ATT: vcvtph2bf8 %zmm3, %ymm2 +# INTEL: vcvtph2bf8 ymm2, zmm3 0x62,0xf2,0x7e,0x48,0x74,0xd3 -# ATT: vcvtneph2bf8 %zmm3, %ymm2 {%k7} -# INTEL: vcvtneph2bf8 ymm2 {k7}, zmm3 +# ATT: vcvtph2bf8 %zmm3, %ymm2 {%k7} +# INTEL: vcvtph2bf8 ymm2 {k7}, zmm3 0x62,0xf2,0x7e,0x4f,0x74,0xd3 -# ATT: vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, zmm3 +# ATT: vcvtph2bf8 %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8 ymm2 {k7} {z}, zmm3 0x62,0xf2,0x7e,0xcf,0x74,0xd3 -# ATT: vcvtneph2bf8 %ymm3, %xmm2 -# INTEL: vcvtneph2bf8 xmm2, ymm3 +# ATT: vcvtph2bf8 %ymm3, %xmm2 +# INTEL: vcvtph2bf8 xmm2, ymm3 0x62,0xf2,0x7e,0x28,0x74,0xd3 -# ATT: vcvtneph2bf8 %ymm3, %xmm2 {%k7} -# INTEL: vcvtneph2bf8 xmm2 {k7}, ymm3 +# ATT: vcvtph2bf8 %ymm3, %xmm2 {%k7} +# INTEL: vcvtph2bf8 xmm2 {k7}, ymm3 0x62,0xf2,0x7e,0x2f,0x74,0xd3 -# ATT: vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, ymm3 +# ATT: vcvtph2bf8 %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 
xmm2 {k7} {z}, ymm3 0x62,0xf2,0x7e,0xaf,0x74,0xd3 -# ATT: vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 -# INTEL: vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2bf8x 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} -# INTEL: vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] 0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8 (%eax){1to8}, %xmm2 -# INTEL: vcvtneph2bf8 xmm2, word ptr [eax]{1to8} +# ATT: vcvtph2bf8 (%eax){1to8}, %xmm2 +# INTEL: vcvtph2bf8 xmm2, word ptr [eax]{1to8} 0x62,0xf2,0x7e,0x18,0x74,0x10 -# ATT: vcvtneph2bf8x -512(,%ebp,2), %xmm2 -# INTEL: vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] +# ATT: vcvtph2bf8x -512(,%ebp,2), %xmm2 +# INTEL: vcvtph2bf8 xmm2, xmmword ptr [2*ebp - 512] 0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +# ATT: vcvtph2bf8x 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] 0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f -# ATT: vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +# ATT: vcvtph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} 0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80 -# ATT: vcvtneph2bf8 (%eax){1to16}, %xmm2 -# INTEL: vcvtneph2bf8 xmm2, word ptr [eax]{1to16} +# ATT: vcvtph2bf8 (%eax){1to16}, %xmm2 +# INTEL: vcvtph2bf8 xmm2, word ptr [eax]{1to16} 0x62,0xf2,0x7e,0x38,0x74,0x10 -# ATT: vcvtneph2bf8y -1024(,%ebp,2), %xmm2 -# INTEL: vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] +# ATT: vcvtph2bf8y -1024(,%ebp,2), %xmm2 +# INTEL: vcvtph2bf8 xmm2, ymmword 
ptr [2*ebp - 1024] 0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +# ATT: vcvtph2bf8y 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] 0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f -# ATT: vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +# ATT: vcvtph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80 -# ATT: vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 -# INTEL: vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2bf8 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} -# INTEL: vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2bf8 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] 0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8 (%eax){1to32}, %ymm2 -# INTEL: vcvtneph2bf8 ymm2, word ptr [eax]{1to32} +# ATT: vcvtph2bf8 (%eax){1to32}, %ymm2 +# INTEL: vcvtph2bf8 ymm2, word ptr [eax]{1to32} 0x62,0xf2,0x7e,0x58,0x74,0x10 -# ATT: vcvtneph2bf8 -2048(,%ebp,2), %ymm2 -# INTEL: vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] +# ATT: vcvtph2bf8 -2048(,%ebp,2), %ymm2 +# INTEL: vcvtph2bf8 ymm2, zmmword ptr [2*ebp - 2048] 0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +# ATT: vcvtph2bf8 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] 0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f -# ATT: vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, word ptr 
[edx - 256]{1to32} +# ATT: vcvtph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} 0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80 -# ATT: vcvtneph2bf8s %xmm3, %xmm2 -# INTEL: vcvtneph2bf8s xmm2, xmm3 +# ATT: vcvtph2bf8s %xmm3, %xmm2 +# INTEL: vcvtph2bf8s xmm2, xmm3 0x62,0xf5,0x7e,0x08,0x74,0xd3 -# ATT: vcvtneph2bf8s %xmm3, %xmm2 {%k7} -# INTEL: vcvtneph2bf8s xmm2 {k7}, xmm3 +# ATT: vcvtph2bf8s %xmm3, %xmm2 {%k7} +# INTEL: vcvtph2bf8s xmm2 {k7}, xmm3 0x62,0xf5,0x7e,0x0f,0x74,0xd3 -# ATT: vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, xmm3 +# ATT: vcvtph2bf8s %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, xmm3 0x62,0xf5,0x7e,0x8f,0x74,0xd3 -# ATT: vcvtneph2bf8s %zmm3, %ymm2 -# INTEL: vcvtneph2bf8s ymm2, zmm3 +# ATT: vcvtph2bf8s %zmm3, %ymm2 +# INTEL: vcvtph2bf8s ymm2, zmm3 0x62,0xf5,0x7e,0x48,0x74,0xd3 -# ATT: vcvtneph2bf8s %zmm3, %ymm2 {%k7} -# INTEL: vcvtneph2bf8s ymm2 {k7}, zmm3 +# ATT: vcvtph2bf8s %zmm3, %ymm2 {%k7} +# INTEL: vcvtph2bf8s ymm2 {k7}, zmm3 0x62,0xf5,0x7e,0x4f,0x74,0xd3 -# ATT: vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, zmm3 +# ATT: vcvtph2bf8s %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8s ymm2 {k7} {z}, zmm3 0x62,0xf5,0x7e,0xcf,0x74,0xd3 -# ATT: vcvtneph2bf8s %ymm3, %xmm2 -# INTEL: vcvtneph2bf8s xmm2, ymm3 +# ATT: vcvtph2bf8s %ymm3, %xmm2 +# INTEL: vcvtph2bf8s xmm2, ymm3 0x62,0xf5,0x7e,0x28,0x74,0xd3 -# ATT: vcvtneph2bf8s %ymm3, %xmm2 {%k7} -# INTEL: vcvtneph2bf8s xmm2 {k7}, ymm3 +# ATT: vcvtph2bf8s %ymm3, %xmm2 {%k7} +# INTEL: vcvtph2bf8s xmm2 {k7}, ymm3 0x62,0xf5,0x7e,0x2f,0x74,0xd3 -# ATT: vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, ymm3 +# ATT: vcvtph2bf8s %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, ymm3 0x62,0xf5,0x7e,0xaf,0x74,0xd3 -# ATT: vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 -# INTEL: vcvtneph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2bf8sx 
268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} -# INTEL: vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8s (%eax){1to8}, %xmm2 -# INTEL: vcvtneph2bf8s xmm2, word ptr [eax]{1to8} +# ATT: vcvtph2bf8s (%eax){1to8}, %xmm2 +# INTEL: vcvtph2bf8s xmm2, word ptr [eax]{1to8} 0x62,0xf5,0x7e,0x18,0x74,0x10 -# ATT: vcvtneph2bf8sx -512(,%ebp,2), %xmm2 -# INTEL: vcvtneph2bf8s xmm2, xmmword ptr [2*ebp - 512] +# ATT: vcvtph2bf8sx -512(,%ebp,2), %xmm2 +# INTEL: vcvtph2bf8s xmm2, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +# ATT: vcvtph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] 0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f -# ATT: vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +# ATT: vcvtph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} 0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80 -# ATT: vcvtneph2bf8s (%eax){1to16}, %xmm2 -# INTEL: vcvtneph2bf8s xmm2, word ptr [eax]{1to16} +# ATT: vcvtph2bf8s (%eax){1to16}, %xmm2 +# INTEL: vcvtph2bf8s xmm2, word ptr [eax]{1to16} 0x62,0xf5,0x7e,0x38,0x74,0x10 -# ATT: vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 -# INTEL: vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] +# ATT: vcvtph2bf8sy -1024(,%ebp,2), %xmm2 +# INTEL: vcvtph2bf8s xmm2, ymmword ptr [2*ebp - 1024] 0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} 
{z}, ymmword ptr [ecx + 4064] +# ATT: vcvtph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] 0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f -# ATT: vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +# ATT: vcvtph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80 -# ATT: vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 -# INTEL: vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2bf8s 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} -# INTEL: vcvtneph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8s (%eax){1to32}, %ymm2 -# INTEL: vcvtneph2bf8s ymm2, word ptr [eax]{1to32} +# ATT: vcvtph2bf8s (%eax){1to32}, %ymm2 +# INTEL: vcvtph2bf8s ymm2, word ptr [eax]{1to32} 0x62,0xf5,0x7e,0x58,0x74,0x10 -# ATT: vcvtneph2bf8s -2048(,%ebp,2), %ymm2 -# INTEL: vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] +# ATT: vcvtph2bf8s -2048(,%ebp,2), %ymm2 +# INTEL: vcvtph2bf8s ymm2, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +# ATT: vcvtph2bf8s 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] 0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f -# ATT: vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +# ATT: vcvtph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtph2bf8s ymm2 {k7} {z}, word ptr [edx - 
256]{1to32} 0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80 -# ATT: vcvtneph2hf8 %xmm3, %xmm2 -# INTEL: vcvtneph2hf8 xmm2, xmm3 +# ATT: vcvtph2hf8 %xmm3, %xmm2 +# INTEL: vcvtph2hf8 xmm2, xmm3 0x62,0xf5,0x7e,0x08,0x18,0xd3 -# ATT: vcvtneph2hf8 %xmm3, %xmm2 {%k7} -# INTEL: vcvtneph2hf8 xmm2 {k7}, xmm3 +# ATT: vcvtph2hf8 %xmm3, %xmm2 {%k7} +# INTEL: vcvtph2hf8 xmm2 {k7}, xmm3 0x62,0xf5,0x7e,0x0f,0x18,0xd3 -# ATT: vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, xmm3 +# ATT: vcvtph2hf8 %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, xmm3 0x62,0xf5,0x7e,0x8f,0x18,0xd3 -# ATT: vcvtneph2hf8 %zmm3, %ymm2 -# INTEL: vcvtneph2hf8 ymm2, zmm3 +# ATT: vcvtph2hf8 %zmm3, %ymm2 +# INTEL: vcvtph2hf8 ymm2, zmm3 0x62,0xf5,0x7e,0x48,0x18,0xd3 -# ATT: vcvtneph2hf8 %zmm3, %ymm2 {%k7} -# INTEL: vcvtneph2hf8 ymm2 {k7}, zmm3 +# ATT: vcvtph2hf8 %zmm3, %ymm2 {%k7} +# INTEL: vcvtph2hf8 ymm2 {k7}, zmm3 0x62,0xf5,0x7e,0x4f,0x18,0xd3 -# ATT: vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, zmm3 +# ATT: vcvtph2hf8 %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8 ymm2 {k7} {z}, zmm3 0x62,0xf5,0x7e,0xcf,0x18,0xd3 -# ATT: vcvtneph2hf8 %ymm3, %xmm2 -# INTEL: vcvtneph2hf8 xmm2, ymm3 +# ATT: vcvtph2hf8 %ymm3, %xmm2 +# INTEL: vcvtph2hf8 xmm2, ymm3 0x62,0xf5,0x7e,0x28,0x18,0xd3 -# ATT: vcvtneph2hf8 %ymm3, %xmm2 {%k7} -# INTEL: vcvtneph2hf8 xmm2 {k7}, ymm3 +# ATT: vcvtph2hf8 %ymm3, %xmm2 {%k7} +# INTEL: vcvtph2hf8 xmm2 {k7}, ymm3 0x62,0xf5,0x7e,0x2f,0x18,0xd3 -# ATT: vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, ymm3 +# ATT: vcvtph2hf8 %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, ymm3 0x62,0xf5,0x7e,0xaf,0x18,0xd3 -# ATT: vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 -# INTEL: vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2hf8x 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8x 
291(%edi,%eax,4), %xmm2 {%k7} -# INTEL: vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8 (%eax){1to8}, %xmm2 -# INTEL: vcvtneph2hf8 xmm2, word ptr [eax]{1to8} +# ATT: vcvtph2hf8 (%eax){1to8}, %xmm2 +# INTEL: vcvtph2hf8 xmm2, word ptr [eax]{1to8} 0x62,0xf5,0x7e,0x18,0x18,0x10 -# ATT: vcvtneph2hf8x -512(,%ebp,2), %xmm2 -# INTEL: vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] +# ATT: vcvtph2hf8x -512(,%ebp,2), %xmm2 +# INTEL: vcvtph2hf8 xmm2, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +# ATT: vcvtph2hf8x 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] 0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f -# ATT: vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +# ATT: vcvtph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} 0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80 -# ATT: vcvtneph2hf8 (%eax){1to16}, %xmm2 -# INTEL: vcvtneph2hf8 xmm2, word ptr [eax]{1to16} +# ATT: vcvtph2hf8 (%eax){1to16}, %xmm2 +# INTEL: vcvtph2hf8 xmm2, word ptr [eax]{1to16} 0x62,0xf5,0x7e,0x38,0x18,0x10 -# ATT: vcvtneph2hf8y -1024(,%ebp,2), %xmm2 -# INTEL: vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] +# ATT: vcvtph2hf8y -1024(,%ebp,2), %xmm2 +# INTEL: vcvtph2hf8 xmm2, ymmword ptr [2*ebp - 1024] 0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +# ATT: vcvtph2hf8y 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] 0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f -# ATT: vcvtneph2hf8 
-256(%edx){1to16}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +# ATT: vcvtph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80 -# ATT: vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 -# INTEL: vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2hf8 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} -# INTEL: vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2hf8 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8 (%eax){1to32}, %ymm2 -# INTEL: vcvtneph2hf8 ymm2, word ptr [eax]{1to32} +# ATT: vcvtph2hf8 (%eax){1to32}, %ymm2 +# INTEL: vcvtph2hf8 ymm2, word ptr [eax]{1to32} 0x62,0xf5,0x7e,0x58,0x18,0x10 -# ATT: vcvtneph2hf8 -2048(,%ebp,2), %ymm2 -# INTEL: vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] +# ATT: vcvtph2hf8 -2048(,%ebp,2), %ymm2 +# INTEL: vcvtph2hf8 ymm2, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +# ATT: vcvtph2hf8 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] 0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f -# ATT: vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +# ATT: vcvtph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} 0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80 -# ATT: vcvtneph2hf8s %xmm3, %xmm2 -# INTEL: vcvtneph2hf8s xmm2, xmm3 +# ATT: vcvtph2hf8s %xmm3, %xmm2 +# INTEL: vcvtph2hf8s xmm2, xmm3 0x62,0xf5,0x7e,0x08,0x1b,0xd3 -# ATT: 
vcvtneph2hf8s %xmm3, %xmm2 {%k7} -# INTEL: vcvtneph2hf8s xmm2 {k7}, xmm3 +# ATT: vcvtph2hf8s %xmm3, %xmm2 {%k7} +# INTEL: vcvtph2hf8s xmm2 {k7}, xmm3 0x62,0xf5,0x7e,0x0f,0x1b,0xd3 -# ATT: vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, xmm3 +# ATT: vcvtph2hf8s %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8s xmm2 {k7} {z}, xmm3 0x62,0xf5,0x7e,0x8f,0x1b,0xd3 -# ATT: vcvtneph2hf8s %zmm3, %ymm2 -# INTEL: vcvtneph2hf8s ymm2, zmm3 +# ATT: vcvtph2hf8s %zmm3, %ymm2 +# INTEL: vcvtph2hf8s ymm2, zmm3 0x62,0xf5,0x7e,0x48,0x1b,0xd3 -# ATT: vcvtneph2hf8s %zmm3, %ymm2 {%k7} -# INTEL: vcvtneph2hf8s ymm2 {k7}, zmm3 +# ATT: vcvtph2hf8s %zmm3, %ymm2 {%k7} +# INTEL: vcvtph2hf8s ymm2 {k7}, zmm3 0x62,0xf5,0x7e,0x4f,0x1b,0xd3 -# ATT: vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, zmm3 +# ATT: vcvtph2hf8s %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8s ymm2 {k7} {z}, zmm3 0x62,0xf5,0x7e,0xcf,0x1b,0xd3 -# ATT: vcvtneph2hf8s %ymm3, %xmm2 -# INTEL: vcvtneph2hf8s xmm2, ymm3 +# ATT: vcvtph2hf8s %ymm3, %xmm2 +# INTEL: vcvtph2hf8s xmm2, ymm3 0x62,0xf5,0x7e,0x28,0x1b,0xd3 -# ATT: vcvtneph2hf8s %ymm3, %xmm2 {%k7} -# INTEL: vcvtneph2hf8s xmm2 {k7}, ymm3 +# ATT: vcvtph2hf8s %ymm3, %xmm2 {%k7} +# INTEL: vcvtph2hf8s xmm2 {k7}, ymm3 0x62,0xf5,0x7e,0x2f,0x1b,0xd3 -# ATT: vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, ymm3 +# ATT: vcvtph2hf8s %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8s xmm2 {k7} {z}, ymm3 0x62,0xf5,0x7e,0xaf,0x1b,0xd3 -# ATT: vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 -# INTEL: vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2hf8sx 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} -# INTEL: vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtph2hf8s xmm2 {k7}, 
xmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8s (%eax){1to8}, %xmm2 -# INTEL: vcvtneph2hf8s xmm2, word ptr [eax]{1to8} +# ATT: vcvtph2hf8s (%eax){1to8}, %xmm2 +# INTEL: vcvtph2hf8s xmm2, word ptr [eax]{1to8} 0x62,0xf5,0x7e,0x18,0x1b,0x10 -# ATT: vcvtneph2hf8sx -512(,%ebp,2), %xmm2 -# INTEL: vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] +# ATT: vcvtph2hf8sx -512(,%ebp,2), %xmm2 +# INTEL: vcvtph2hf8s xmm2, xmmword ptr [2*ebp - 512] 0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +# ATT: vcvtph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] 0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f -# ATT: vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +# ATT: vcvtph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} 0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80 -# ATT: vcvtneph2hf8s (%eax){1to16}, %xmm2 -# INTEL: vcvtneph2hf8s xmm2, word ptr [eax]{1to16} +# ATT: vcvtph2hf8s (%eax){1to16}, %xmm2 +# INTEL: vcvtph2hf8s xmm2, word ptr [eax]{1to16} 0x62,0xf5,0x7e,0x38,0x1b,0x10 -# ATT: vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 -# INTEL: vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] +# ATT: vcvtph2hf8sy -1024(,%ebp,2), %xmm2 +# INTEL: vcvtph2hf8s xmm2, ymmword ptr [2*ebp - 1024] 0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +# ATT: vcvtph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] 0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f -# ATT: vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +# ATT: vcvtph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +# 
INTEL: vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80 -# ATT: vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 -# INTEL: vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +# ATT: vcvtph2hf8s 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] 0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} -# INTEL: vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +# ATT: vcvtph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] 0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8s (%eax){1to32}, %ymm2 -# INTEL: vcvtneph2hf8s ymm2, word ptr [eax]{1to32} +# ATT: vcvtph2hf8s (%eax){1to32}, %ymm2 +# INTEL: vcvtph2hf8s ymm2, word ptr [eax]{1to32} 0x62,0xf5,0x7e,0x58,0x1b,0x10 -# ATT: vcvtneph2hf8s -2048(,%ebp,2), %ymm2 -# INTEL: vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] +# ATT: vcvtph2hf8s -2048(,%ebp,2), %ymm2 +# INTEL: vcvtph2hf8s ymm2, zmmword ptr [2*ebp - 2048] 0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +# ATT: vcvtph2hf8s 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] 0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f -# ATT: vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +# ATT: vcvtph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} 0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt index 82bf09c49e9260..611a584df87cfe 100644 --- a/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt +++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt @@ -657,835 +657,835 @@ # INTEL: 
vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rdx - 4096] 0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80 -# ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8 ymm22, ymm23, ymm24 +# ATT: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8 ymm22, ymm23, ymm24 0x62,0x82,0x47,0x20,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 +# ATT: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2bf8 ymm22 {k7}, ymm23, ymm24 0x62,0x82,0x47,0x27,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 +# ATT: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 0x62,0x82,0x47,0xa7,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmm24 +# ATT: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8 zmm22, zmm23, zmm24 0x62,0x82,0x47,0x40,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 +# ATT: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2bf8 zmm22 {k7}, zmm23, zmm24 0x62,0x82,0x47,0x47,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 +# ATT: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 0x62,0x82,0x47,0xc7,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmm24 +# ATT: vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8 xmm22, xmm23, xmm24 0x62,0x82,0x47,0x00,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 +# ATT: vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2bf8 xmm22 {k7}, xmm23, xmm24 0x62,0x82,0x47,0x07,0x74,0xf0 -# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +# ATT: vcvt2ph2bf8 %xmm24, 
%xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 0x62,0x82,0x47,0x87,0x74,0xf0 -# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} +# ATT: vcvt2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} 0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +# ATT: vcvt2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] 0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +# ATT: vcvt2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] 0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +# ATT: vcvt2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} 0x62,0xe2,0x47,0xd7,0x74,0x72,0x80 -# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8 
ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] 0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} +# ATT: vcvt2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} 0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +# ATT: vcvt2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] 0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +# ATT: vcvt2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +# ATT: vcvt2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} 0x62,0xe2,0x47,0xb7,0x74,0x72,0x80 -# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] 
0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} +# ATT: vcvt2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} 0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +# ATT: vcvt2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] 0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +# ATT: vcvt2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 0x62,0xe2,0x47,0x87,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +# ATT: vcvt2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} 0x62,0xe2,0x47,0x97,0x74,0x72,0x80 -# ATT: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymm24 +# ATT: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8s ymm22, ymm23, ymm24 0x62,0x85,0x47,0x20,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 +# ATT: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2bf8s ymm22 {k7}, ymm23, ymm24 0x62,0x85,0x47,0x27,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %ymm24, 
%ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 +# ATT: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 0x62,0x85,0x47,0xa7,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmm24 +# ATT: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8s zmm22, zmm23, zmm24 0x62,0x85,0x47,0x40,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 +# ATT: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2bf8s zmm22 {k7}, zmm23, zmm24 0x62,0x85,0x47,0x47,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 +# ATT: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 0x62,0x85,0x47,0xc7,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmm24 +# ATT: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8s xmm22, xmm23, xmm24 0x62,0x85,0x47,0x00,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 +# ATT: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2bf8s xmm22 {k7}, xmm23, xmm24 0x62,0x85,0x47,0x07,0x74,0xf0 -# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +# ATT: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 0x62,0x85,0x47,0x87,0x74,0xf0 -# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2bf8s 
zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} +# ATT: vcvt2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} 0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 -# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +# ATT: vcvt2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +# ATT: vcvt2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] 0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +# ATT: vcvt2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} 0x62,0xe5,0x47,0xd7,0x74,0x72,0x80 -# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 
4*rax + 291] 0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} +# ATT: vcvt2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} 0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 -# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +# ATT: vcvt2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +# ATT: vcvt2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +# ATT: vcvt2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x47,0xb7,0x74,0x72,0x80 -# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8s xmm22, xmm23, word ptr 
[rip]{1to8} +# ATT: vcvt2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} 0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 -# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +# ATT: vcvt2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +# ATT: vcvt2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 0x62,0xe5,0x47,0x87,0x74,0x71,0x7f -# ATT: vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +# ATT: vcvt2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} 0x62,0xe5,0x47,0x97,0x74,0x72,0x80 -# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymm24 +# ATT: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8 ymm22, ymm23, ymm24 0x62,0x85,0x47,0x20,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 +# ATT: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2hf8 ymm22 {k7}, ymm23, ymm24 0x62,0x85,0x47,0x27,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 +# ATT: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 0x62,0x85,0x47,0xa7,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmm24 +# ATT: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8 zmm22, zmm23, zmm24 0x62,0x85,0x47,0x40,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} -# 
INTEL: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 +# ATT: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2hf8 zmm22 {k7}, zmm23, zmm24 0x62,0x85,0x47,0x47,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 +# ATT: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 0x62,0x85,0x47,0xc7,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmm24 +# ATT: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8 xmm22, xmm23, xmm24 0x62,0x85,0x47,0x00,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 +# ATT: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2hf8 xmm22 {k7}, xmm23, xmm24 0x62,0x85,0x47,0x07,0x18,0xf0 -# ATT: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +# ATT: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 0x62,0x85,0x47,0x87,0x18,0xf0 -# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} +# ATT: vcvt2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} 0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00 
-# ATT: vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +# ATT: vcvt2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +# ATT: vcvt2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] 0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f -# ATT: vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +# ATT: vcvt2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} 0x62,0xe5,0x47,0xd7,0x18,0x72,0x80 -# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} +# ATT: vcvt2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} 0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +# ATT: vcvt2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8 
ymm22, ymm23, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +# ATT: vcvt2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f -# ATT: vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +# ATT: vcvt2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x47,0xb7,0x18,0x72,0x80 -# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +# ATT: vcvt2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} 0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +# ATT: vcvt2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, 
xmm23, xmmword ptr [rcx + 2032] +# ATT: vcvt2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 0x62,0xe5,0x47,0x87,0x18,0x71,0x7f -# ATT: vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +# ATT: vcvt2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} 0x62,0xe5,0x47,0x97,0x18,0x72,0x80 -# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymm24 +# ATT: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8s ymm22, ymm23, ymm24 0x62,0x85,0x47,0x20,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 +# ATT: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2hf8s ymm22 {k7}, ymm23, ymm24 0x62,0x85,0x47,0x27,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 +# ATT: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 0x62,0x85,0x47,0xa7,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmm24 +# ATT: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8s zmm22, zmm23, zmm24 0x62,0x85,0x47,0x40,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 +# ATT: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2hf8s zmm22 {k7}, zmm23, zmm24 0x62,0x85,0x47,0x47,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 +# ATT: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 0x62,0x85,0x47,0xc7,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmm24 +# ATT: vcvt2ph2hf8s %xmm24, %xmm23, 
%xmm22 +# INTEL: vcvt2ph2hf8s xmm22, xmm23, xmm24 0x62,0x85,0x47,0x00,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 +# ATT: vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2hf8s xmm22 {k7}, xmm23, xmm24 0x62,0x85,0x47,0x07,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +# ATT: vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 0x62,0x85,0x47,0x87,0x1b,0xf0 -# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -# INTEL: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} +# ATT: vcvt2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} 0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 -# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +# ATT: vcvt2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +# ATT: vcvt2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm22 {k7} 
{z}, zmm23, zmmword ptr [rcx + 8128] 0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f -# ATT: vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +# ATT: vcvt2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} 0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80 -# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -# INTEL: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} +# ATT: vcvt2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} 0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 -# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +# ATT: vcvt2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +# ATT: vcvt2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f -# ATT: vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s 
ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +# ATT: vcvt2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80 -# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -# INTEL: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvt2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} +# ATT: vcvt2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} 0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 -# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +# ATT: vcvt2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +# ATT: vcvt2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f -# ATT: vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +# ATT: vcvt2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 
256]{1to8} 0x62,0xe5,0x47,0x97,0x1b,0x72,0x80 -# ATT: vcvtneph2bf8 %xmm23, %xmm22 -# INTEL: vcvtneph2bf8 xmm22, xmm23 +# ATT: vcvtph2bf8 %xmm23, %xmm22 +# INTEL: vcvtph2bf8 xmm22, xmm23 0x62,0xa2,0x7e,0x08,0x74,0xf7 -# ATT: vcvtneph2bf8 %xmm23, %xmm22 {%k7} -# INTEL: vcvtneph2bf8 xmm22 {k7}, xmm23 +# ATT: vcvtph2bf8 %xmm23, %xmm22 {%k7} +# INTEL: vcvtph2bf8 xmm22 {k7}, xmm23 0x62,0xa2,0x7e,0x0f,0x74,0xf7 -# ATT: vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, xmm23 +# ATT: vcvtph2bf8 %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, xmm23 0x62,0xa2,0x7e,0x8f,0x74,0xf7 -# ATT: vcvtneph2bf8 %zmm23, %ymm22 -# INTEL: vcvtneph2bf8 ymm22, zmm23 +# ATT: vcvtph2bf8 %zmm23, %ymm22 +# INTEL: vcvtph2bf8 ymm22, zmm23 0x62,0xa2,0x7e,0x48,0x74,0xf7 -# ATT: vcvtneph2bf8 %zmm23, %ymm22 {%k7} -# INTEL: vcvtneph2bf8 ymm22 {k7}, zmm23 +# ATT: vcvtph2bf8 %zmm23, %ymm22 {%k7} +# INTEL: vcvtph2bf8 ymm22 {k7}, zmm23 0x62,0xa2,0x7e,0x4f,0x74,0xf7 -# ATT: vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, zmm23 +# ATT: vcvtph2bf8 %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8 ymm22 {k7} {z}, zmm23 0x62,0xa2,0x7e,0xcf,0x74,0xf7 -# ATT: vcvtneph2bf8 %ymm23, %xmm22 -# INTEL: vcvtneph2bf8 xmm22, ymm23 +# ATT: vcvtph2bf8 %ymm23, %xmm22 +# INTEL: vcvtph2bf8 xmm22, ymm23 0x62,0xa2,0x7e,0x28,0x74,0xf7 -# ATT: vcvtneph2bf8 %ymm23, %xmm22 {%k7} -# INTEL: vcvtneph2bf8 xmm22 {k7}, ymm23 +# ATT: vcvtph2bf8 %ymm23, %xmm22 {%k7} +# INTEL: vcvtph2bf8 xmm22 {k7}, ymm23 0x62,0xa2,0x7e,0x2f,0x74,0xf7 -# ATT: vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, ymm23 +# ATT: vcvtph2bf8 %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, ymm23 0x62,0xa2,0x7e,0xaf,0x74,0xf7 -# ATT: vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 -# INTEL: vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2bf8x 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] 
0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} -# INTEL: vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8 (%rip){1to8}, %xmm22 -# INTEL: vcvtneph2bf8 xmm22, word ptr [rip]{1to8} +# ATT: vcvtph2bf8 (%rip){1to8}, %xmm22 +# INTEL: vcvtph2bf8 xmm22, word ptr [rip]{1to8} 0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8x -512(,%rbp,2), %xmm22 -# INTEL: vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] +# ATT: vcvtph2bf8x -512(,%rbp,2), %xmm22 +# INTEL: vcvtph2bf8 xmm22, xmmword ptr [2*rbp - 512] 0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +# ATT: vcvtph2bf8x 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] 0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f -# ATT: vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +# ATT: vcvtph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} 0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80 -# ATT: vcvtneph2bf8 (%rip){1to16}, %xmm22 -# INTEL: vcvtneph2bf8 xmm22, word ptr [rip]{1to16} +# ATT: vcvtph2bf8 (%rip){1to16}, %xmm22 +# INTEL: vcvtph2bf8 xmm22, word ptr [rip]{1to16} 0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8y -1024(,%rbp,2), %xmm22 -# INTEL: vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] +# ATT: vcvtph2bf8y -1024(,%rbp,2), %xmm22 +# INTEL: vcvtph2bf8 xmm22, ymmword ptr [2*rbp - 1024] 0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +# ATT: vcvtph2bf8y 4064(%rcx), 
%xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] 0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f -# ATT: vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +# ATT: vcvtph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} 0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80 -# ATT: vcvtneph2bf8 268435456(%rbp,%r14,8), %ymm22 -# INTEL: vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2bf8 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} -# INTEL: vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2bf8 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8 (%rip){1to32}, %ymm22 -# INTEL: vcvtneph2bf8 ymm22, word ptr [rip]{1to32} +# ATT: vcvtph2bf8 (%rip){1to32}, %ymm22 +# INTEL: vcvtph2bf8 ymm22, word ptr [rip]{1to32} 0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8 -2048(,%rbp,2), %ymm22 -# INTEL: vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] +# ATT: vcvtph2bf8 -2048(,%rbp,2), %ymm22 +# INTEL: vcvtph2bf8 ymm22, zmmword ptr [2*rbp - 2048] 0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +# ATT: vcvtph2bf8 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] 0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f -# ATT: vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +# ATT: vcvtph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} 0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80 -# 
ATT: vcvtneph2bf8s %xmm23, %xmm22 -# INTEL: vcvtneph2bf8s xmm22, xmm23 +# ATT: vcvtph2bf8s %xmm23, %xmm22 +# INTEL: vcvtph2bf8s xmm22, xmm23 0x62,0xa5,0x7e,0x08,0x74,0xf7 -# ATT: vcvtneph2bf8s %xmm23, %xmm22 {%k7} -# INTEL: vcvtneph2bf8s xmm22 {k7}, xmm23 +# ATT: vcvtph2bf8s %xmm23, %xmm22 {%k7} +# INTEL: vcvtph2bf8s xmm22 {k7}, xmm23 0x62,0xa5,0x7e,0x0f,0x74,0xf7 -# ATT: vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, xmm23 +# ATT: vcvtph2bf8s %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, xmm23 0x62,0xa5,0x7e,0x8f,0x74,0xf7 -# ATT: vcvtneph2bf8s %zmm23, %ymm22 -# INTEL: vcvtneph2bf8s ymm22, zmm23 +# ATT: vcvtph2bf8s %zmm23, %ymm22 +# INTEL: vcvtph2bf8s ymm22, zmm23 0x62,0xa5,0x7e,0x48,0x74,0xf7 -# ATT: vcvtneph2bf8s %zmm23, %ymm22 {%k7} -# INTEL: vcvtneph2bf8s ymm22 {k7}, zmm23 +# ATT: vcvtph2bf8s %zmm23, %ymm22 {%k7} +# INTEL: vcvtph2bf8s ymm22 {k7}, zmm23 0x62,0xa5,0x7e,0x4f,0x74,0xf7 -# ATT: vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, zmm23 +# ATT: vcvtph2bf8s %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8s ymm22 {k7} {z}, zmm23 0x62,0xa5,0x7e,0xcf,0x74,0xf7 -# ATT: vcvtneph2bf8s %ymm23, %xmm22 -# INTEL: vcvtneph2bf8s xmm22, ymm23 +# ATT: vcvtph2bf8s %ymm23, %xmm22 +# INTEL: vcvtph2bf8s xmm22, ymm23 0x62,0xa5,0x7e,0x28,0x74,0xf7 -# ATT: vcvtneph2bf8s %ymm23, %xmm22 {%k7} -# INTEL: vcvtneph2bf8s xmm22 {k7}, ymm23 +# ATT: vcvtph2bf8s %ymm23, %xmm22 {%k7} +# INTEL: vcvtph2bf8s xmm22 {k7}, ymm23 0x62,0xa5,0x7e,0x2f,0x74,0xf7 -# ATT: vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, ymm23 +# ATT: vcvtph2bf8s %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, ymm23 0x62,0xa5,0x7e,0xaf,0x74,0xf7 -# ATT: vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 -# INTEL: vcvtneph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2bf8sx 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] 
0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} -# INTEL: vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8s (%rip){1to8}, %xmm22 -# INTEL: vcvtneph2bf8s xmm22, word ptr [rip]{1to8} +# ATT: vcvtph2bf8s (%rip){1to8}, %xmm22 +# INTEL: vcvtph2bf8s xmm22, word ptr [rip]{1to8} 0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8sx -512(,%rbp,2), %xmm22 -# INTEL: vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] +# ATT: vcvtph2bf8sx -512(,%rbp,2), %xmm22 +# INTEL: vcvtph2bf8s xmm22, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +# ATT: vcvtph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] 0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f -# ATT: vcvtneph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +# ATT: vcvtph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} 0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80 -# ATT: vcvtneph2bf8s (%rip){1to16}, %xmm22 -# INTEL: vcvtneph2bf8s xmm22, word ptr [rip]{1to16} +# ATT: vcvtph2bf8s (%rip){1to16}, %xmm22 +# INTEL: vcvtph2bf8s xmm22, word ptr [rip]{1to16} 0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 -# INTEL: vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] +# ATT: vcvtph2bf8sy -1024(,%rbp,2), %xmm22 +# INTEL: vcvtph2bf8s xmm22, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +# 
ATT: vcvtph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] 0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f -# ATT: vcvtneph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +# ATT: vcvtph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80 -# ATT: vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 -# INTEL: vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2bf8s 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} -# INTEL: vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2bf8s (%rip){1to32}, %ymm22 -# INTEL: vcvtneph2bf8s ymm22, word ptr [rip]{1to32} +# ATT: vcvtph2bf8s (%rip){1to32}, %ymm22 +# INTEL: vcvtph2bf8s ymm22, word ptr [rip]{1to32} 0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2bf8s -2048(,%rbp,2), %ymm22 -# INTEL: vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] +# ATT: vcvtph2bf8s -2048(,%rbp,2), %ymm22 +# INTEL: vcvtph2bf8s ymm22, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +# ATT: vcvtph2bf8s 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] 0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f -# ATT: vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} -# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +# ATT: vcvtph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtph2bf8s ymm22 {k7} {z}, word 
ptr [rdx - 256]{1to32} 0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80 -# ATT: vcvtneph2hf8 %xmm23, %xmm22 -# INTEL: vcvtneph2hf8 xmm22, xmm23 +# ATT: vcvtph2hf8 %xmm23, %xmm22 +# INTEL: vcvtph2hf8 xmm22, xmm23 0x62,0xa5,0x7e,0x08,0x18,0xf7 -# ATT: vcvtneph2hf8 %xmm23, %xmm22 {%k7} -# INTEL: vcvtneph2hf8 xmm22 {k7}, xmm23 +# ATT: vcvtph2hf8 %xmm23, %xmm22 {%k7} +# INTEL: vcvtph2hf8 xmm22 {k7}, xmm23 0x62,0xa5,0x7e,0x0f,0x18,0xf7 -# ATT: vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, xmm23 +# ATT: vcvtph2hf8 %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, xmm23 0x62,0xa5,0x7e,0x8f,0x18,0xf7 -# ATT: vcvtneph2hf8 %zmm23, %ymm22 -# INTEL: vcvtneph2hf8 ymm22, zmm23 +# ATT: vcvtph2hf8 %zmm23, %ymm22 +# INTEL: vcvtph2hf8 ymm22, zmm23 0x62,0xa5,0x7e,0x48,0x18,0xf7 -# ATT: vcvtneph2hf8 %zmm23, %ymm22 {%k7} -# INTEL: vcvtneph2hf8 ymm22 {k7}, zmm23 +# ATT: vcvtph2hf8 %zmm23, %ymm22 {%k7} +# INTEL: vcvtph2hf8 ymm22 {k7}, zmm23 0x62,0xa5,0x7e,0x4f,0x18,0xf7 -# ATT: vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, zmm23 +# ATT: vcvtph2hf8 %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtph2hf8 ymm22 {k7} {z}, zmm23 0x62,0xa5,0x7e,0xcf,0x18,0xf7 -# ATT: vcvtneph2hf8 %ymm23, %xmm22 -# INTEL: vcvtneph2hf8 xmm22, ymm23 +# ATT: vcvtph2hf8 %ymm23, %xmm22 +# INTEL: vcvtph2hf8 xmm22, ymm23 0x62,0xa5,0x7e,0x28,0x18,0xf7 -# ATT: vcvtneph2hf8 %ymm23, %xmm22 {%k7} -# INTEL: vcvtneph2hf8 xmm22 {k7}, ymm23 +# ATT: vcvtph2hf8 %ymm23, %xmm22 {%k7} +# INTEL: vcvtph2hf8 xmm22 {k7}, ymm23 0x62,0xa5,0x7e,0x2f,0x18,0xf7 -# ATT: vcvtneph2hf8 %ymm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, ymm23 +# ATT: vcvtph2hf8 %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, ymm23 0x62,0xa5,0x7e,0xaf,0x18,0xf7 -# ATT: vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 -# INTEL: vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2hf8x 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 
268435456] 0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} -# INTEL: vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8 (%rip){1to8}, %xmm22 -# INTEL: vcvtneph2hf8 xmm22, word ptr [rip]{1to8} +# ATT: vcvtph2hf8 (%rip){1to8}, %xmm22 +# INTEL: vcvtph2hf8 xmm22, word ptr [rip]{1to8} 0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8x -512(,%rbp,2), %xmm22 -# INTEL: vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] +# ATT: vcvtph2hf8x -512(,%rbp,2), %xmm22 +# INTEL: vcvtph2hf8 xmm22, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +# ATT: vcvtph2hf8x 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] 0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f -# ATT: vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +# ATT: vcvtph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} 0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80 -# ATT: vcvtneph2hf8 (%rip){1to16}, %xmm22 -# INTEL: vcvtneph2hf8 xmm22, word ptr [rip]{1to16} +# ATT: vcvtph2hf8 (%rip){1to16}, %xmm22 +# INTEL: vcvtph2hf8 xmm22, word ptr [rip]{1to16} 0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8y -1024(,%rbp,2), %xmm22 -# INTEL: vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] +# ATT: vcvtph2hf8y -1024(,%rbp,2), %xmm22 +# INTEL: vcvtph2hf8 xmm22, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +# ATT: vcvtph2hf8y 
4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] 0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f -# ATT: vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +# ATT: vcvtph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80 -# ATT: vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 -# INTEL: vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2hf8 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} -# INTEL: vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2hf8 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8 (%rip){1to32}, %ymm22 -# INTEL: vcvtneph2hf8 ymm22, word ptr [rip]{1to32} +# ATT: vcvtph2hf8 (%rip){1to32}, %ymm22 +# INTEL: vcvtph2hf8 ymm22, word ptr [rip]{1to32} 0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8 -2048(,%rbp,2), %ymm22 -# INTEL: vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] +# ATT: vcvtph2hf8 -2048(,%rbp,2), %ymm22 +# INTEL: vcvtph2hf8 ymm22, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +# ATT: vcvtph2hf8 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] 0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f -# ATT: vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +# ATT: vcvtph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} 
0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80 -# ATT: vcvtneph2hf8s %xmm23, %xmm22 -# INTEL: vcvtneph2hf8s xmm22, xmm23 +# ATT: vcvtph2hf8s %xmm23, %xmm22 +# INTEL: vcvtph2hf8s xmm22, xmm23 0x62,0xa5,0x7e,0x08,0x1b,0xf7 -# ATT: vcvtneph2hf8s %xmm23, %xmm22 {%k7} -# INTEL: vcvtneph2hf8s xmm22 {k7}, xmm23 +# ATT: vcvtph2hf8s %xmm23, %xmm22 {%k7} +# INTEL: vcvtph2hf8s xmm22 {k7}, xmm23 0x62,0xa5,0x7e,0x0f,0x1b,0xf7 -# ATT: vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, xmm23 +# ATT: vcvtph2hf8s %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, xmm23 0x62,0xa5,0x7e,0x8f,0x1b,0xf7 -# ATT: vcvtneph2hf8s %zmm23, %ymm22 -# INTEL: vcvtneph2hf8s ymm22, zmm23 +# ATT: vcvtph2hf8s %zmm23, %ymm22 +# INTEL: vcvtph2hf8s ymm22, zmm23 0x62,0xa5,0x7e,0x48,0x1b,0xf7 -# ATT: vcvtneph2hf8s %zmm23, %ymm22 {%k7} -# INTEL: vcvtneph2hf8s ymm22 {k7}, zmm23 +# ATT: vcvtph2hf8s %zmm23, %ymm22 {%k7} +# INTEL: vcvtph2hf8s ymm22 {k7}, zmm23 0x62,0xa5,0x7e,0x4f,0x1b,0xf7 -# ATT: vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, zmm23 +# ATT: vcvtph2hf8s %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtph2hf8s ymm22 {k7} {z}, zmm23 0x62,0xa5,0x7e,0xcf,0x1b,0xf7 -# ATT: vcvtneph2hf8s %ymm23, %xmm22 -# INTEL: vcvtneph2hf8s xmm22, ymm23 +# ATT: vcvtph2hf8s %ymm23, %xmm22 +# INTEL: vcvtph2hf8s xmm22, ymm23 0x62,0xa5,0x7e,0x28,0x1b,0xf7 -# ATT: vcvtneph2hf8s %ymm23, %xmm22 {%k7} -# INTEL: vcvtneph2hf8s xmm22 {k7}, ymm23 +# ATT: vcvtph2hf8s %ymm23, %xmm22 {%k7} +# INTEL: vcvtph2hf8s xmm22 {k7}, ymm23 0x62,0xa5,0x7e,0x2f,0x1b,0xf7 -# ATT: vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, ymm23 +# ATT: vcvtph2hf8s %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, ymm23 0x62,0xa5,0x7e,0xaf,0x1b,0xf7 -# ATT: vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 -# INTEL: vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2hf8sx 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtph2hf8s xmm22, xmmword ptr 
[rbp + 8*r14 + 268435456] 0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} -# INTEL: vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8s (%rip){1to8}, %xmm22 -# INTEL: vcvtneph2hf8s xmm22, word ptr [rip]{1to8} +# ATT: vcvtph2hf8s (%rip){1to8}, %xmm22 +# INTEL: vcvtph2hf8s xmm22, word ptr [rip]{1to8} 0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8sx -512(,%rbp,2), %xmm22 -# INTEL: vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] +# ATT: vcvtph2hf8sx -512(,%rbp,2), %xmm22 +# INTEL: vcvtph2hf8s xmm22, xmmword ptr [2*rbp - 512] 0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff -# ATT: vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +# ATT: vcvtph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] 0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f -# ATT: vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +# ATT: vcvtph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} 0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80 -# ATT: vcvtneph2hf8s (%rip){1to16}, %xmm22 -# INTEL: vcvtneph2hf8s xmm22, word ptr [rip]{1to16} +# ATT: vcvtph2hf8s (%rip){1to16}, %xmm22 +# INTEL: vcvtph2hf8s xmm22, word ptr [rip]{1to16} 0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 -# INTEL: vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] +# ATT: vcvtph2hf8sy -1024(,%rbp,2), %xmm22 +# INTEL: vcvtph2hf8s xmm22, ymmword ptr [2*rbp - 1024] 0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff -# ATT: vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, 
ymmword ptr [rcx + 4064] +# ATT: vcvtph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] 0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f -# ATT: vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} -# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +# ATT: vcvtph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} 0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80 -# ATT: vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 -# INTEL: vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +# ATT: vcvtph2hf8s 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 -# ATT: vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} -# INTEL: vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +# ATT: vcvtph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] 0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 -# ATT: vcvtneph2hf8s (%rip){1to32}, %ymm22 -# INTEL: vcvtneph2hf8s ymm22, word ptr [rip]{1to32} +# ATT: vcvtph2hf8s (%rip){1to32}, %ymm22 +# INTEL: vcvtph2hf8s ymm22, word ptr [rip]{1to32} 0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00 -# ATT: vcvtneph2hf8s -2048(,%rbp,2), %ymm22 -# INTEL: vcvtneph2hf8s ymm22, zmmword ptr [2*rbp - 2048] +# ATT: vcvtph2hf8s -2048(,%rbp,2), %ymm22 +# INTEL: vcvtph2hf8s ymm22, zmmword ptr [2*rbp - 2048] 0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff -# ATT: vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +# ATT: vcvtph2hf8s 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] 0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f -# ATT: vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} -# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +# ATT: vcvtph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: 
vcvtph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} 0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80 diff --git a/llvm/test/MC/X86/avx10.2convert-32-att.s b/llvm/test/MC/X86/avx10.2convert-32-att.s index beb48245578010..940279388e6ac9 100644 --- a/llvm/test/MC/X86/avx10.2convert-32-att.s +++ b/llvm/test/MC/X86/avx10.2convert-32-att.s @@ -656,835 +656,835 @@ // CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80] vcvthf82ph -4096(%edx), %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0xd4] - vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 + vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0xd4] - vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} + vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0xd4] - vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0xd4] - vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 + vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0xd4] - vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} + vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0xd4] - vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0xd4] - vcvtne2ph2bf8 %xmm4, 
%xmm3, %xmm2 + vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0xd4] - vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} + vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0xd4] - vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 + vcvt2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + vcvt2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x58,0x74,0x10] - vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 + vcvt2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 + vcvt2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f] - vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// 
CHECK: vcvt2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x74,0x52,0x80] - vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 + vcvt2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + vcvt2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf2,0x67,0x38,0x74,0x10] - vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 + vcvt2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 + vcvt2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f] - vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x74,0x52,0x80] - vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// 
CHECK: vcvt2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + vcvt2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + vcvt2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x18,0x74,0x10] - vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 + vcvt2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 + vcvt2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f] - vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x74,0x52,0x80] - vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0xd4] - vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 + vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0xd4] - 
vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} + vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0xd4] - vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0xd4] - vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 + vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0xd4] - vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} + vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0xd4] - vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0xd4] - vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 + vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0xd4] - vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} + vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0xd4] - vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%esp,%esi,8), %zmm3, 
%zmm2 + vcvt2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + vcvt2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x74,0x10] - vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 + vcvt2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: vcvt2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 + vcvt2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 -// CHECK: vcvtne2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f] - vcvtne2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x74,0x52,0x80] - vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 + vcvt2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + vcvt2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x74,0x10] - vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 + vcvt2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: vcvt2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 + vcvt2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 -// CHECK: vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f] - vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x74,0x52,0x80] - vcvtne2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + vcvt2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + vcvt2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x74,0x10] - vcvtne2ph2bf8s (%eax){1to8}, 
%xmm3, %xmm2 + vcvt2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: vcvt2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 + vcvt2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 -// CHECK: vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f] - vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x74,0x52,0x80] - vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0xd4] - vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 + vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0xd4] - vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} + vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0xd4] - vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0xd4] - vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 + vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0xd4] - vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 
{%k7} + vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0xd4] - vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0xd4] - vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 + vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0xd4] - vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} + vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0xd4] - vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 + vcvt2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + vcvt2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x18,0x10] - vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 + vcvt2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 // CHECK: encoding: 
[0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 + vcvt2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f] - vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x18,0x52,0x80] - vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 + vcvt2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + vcvt2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x18,0x10] - vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 + vcvt2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 + vcvt2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: 
[0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f] - vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x18,0x52,0x80] - vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + vcvt2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + vcvt2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x18,0x10] - vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 + vcvt2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 + vcvt2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f] - vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: 
[0x62,0xf5,0x67,0x9f,0x18,0x52,0x80] - vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0xd4] - vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 + vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0xd4] - vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} + vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0xd4] - vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0xd4] - vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 + vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0xd4] - vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} + vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0xd4] - vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0xd4] - vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 + vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0xd4] - vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} + vcvt2ph2hf8s %xmm4, 
%xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0xd4] - vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 + vcvt2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: vcvt2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + vcvt2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} -// CHECK: vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x1b,0x10] - vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 + vcvt2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: vcvt2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 + vcvt2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 -// CHECK: vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f] - vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80] - vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + vcvt2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} -// 
CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 + vcvt2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: vcvt2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + vcvt2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} -// CHECK: vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x1b,0x10] - vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 + vcvt2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: vcvt2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 + vcvt2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 -// CHECK: vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f] - vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80] - vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + vcvt2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + vcvt2ph2hf8s 
268435456(%esp,%esi,8), %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: vcvt2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + vcvt2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} -// CHECK: vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x1b,0x10] - vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 + vcvt2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: vcvt2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 + vcvt2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 -// CHECK: vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f] - vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80] - vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + vcvt2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 %xmm3, %xmm2 +// CHECK: vcvtph2bf8 %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0xd3] - vcvtneph2bf8 %xmm3, %xmm2 + vcvtph2bf8 %xmm3, %xmm2 -// CHECK: vcvtneph2bf8 %xmm3, %xmm2 {%k7} +// CHECK: vcvtph2bf8 %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0xd3] - vcvtneph2bf8 %xmm3, %xmm2 {%k7} + vcvtph2bf8 %xmm3, %xmm2 {%k7} -// CHECK: vcvtneph2bf8 %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8 %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0xd3] - vcvtneph2bf8 %xmm3, %xmm2 {%k7} 
{z} + vcvtph2bf8 %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 %zmm3, %ymm2 +// CHECK: vcvtph2bf8 %zmm3, %ymm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0xd3] - vcvtneph2bf8 %zmm3, %ymm2 + vcvtph2bf8 %zmm3, %ymm2 -// CHECK: vcvtneph2bf8 %zmm3, %ymm2 {%k7} +// CHECK: vcvtph2bf8 %zmm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0xd3] - vcvtneph2bf8 %zmm3, %ymm2 {%k7} + vcvtph2bf8 %zmm3, %ymm2 {%k7} -// CHECK: vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8 %zmm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0xd3] - vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} + vcvtph2bf8 %zmm3, %ymm2 {%k7} {z} -// CHECK: vcvtneph2bf8 %ymm3, %xmm2 +// CHECK: vcvtph2bf8 %ymm3, %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0xd3] - vcvtneph2bf8 %ymm3, %xmm2 + vcvtph2bf8 %ymm3, %xmm2 -// CHECK: vcvtneph2bf8 %ymm3, %xmm2 {%k7} +// CHECK: vcvtph2bf8 %ymm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x7e,0x2f,0x74,0xd3] - vcvtneph2bf8 %ymm3, %xmm2 {%k7} + vcvtph2bf8 %ymm3, %xmm2 {%k7} -// CHECK: vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8 %ymm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0xd3] - vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} + vcvtph2bf8 %ymm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 +// CHECK: vcvtph2bf8x 268435456(%esp,%esi,8), %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 + vcvtph2bf8x 268435456(%esp,%esi,8), %xmm2 -// CHECK: vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: vcvtph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} + vcvtph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} -// CHECK: vcvtneph2bf8 (%eax){1to8}, %xmm2 +// CHECK: vcvtph2bf8 (%eax){1to8}, %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x18,0x74,0x10] - vcvtneph2bf8 (%eax){1to8}, %xmm2 + vcvtph2bf8 (%eax){1to8}, %xmm2 -// 
CHECK: vcvtneph2bf8x -512(,%ebp,2), %xmm2 +// CHECK: vcvtph2bf8x -512(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8x -512(,%ebp,2), %xmm2 + vcvtph2bf8x -512(,%ebp,2), %xmm2 -// CHECK: vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8x 2032(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f] - vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} + vcvtph2bf8x 2032(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80] - vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} + vcvtph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 (%eax){1to16}, %xmm2 +// CHECK: vcvtph2bf8 (%eax){1to16}, %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x38,0x74,0x10] - vcvtneph2bf8 (%eax){1to16}, %xmm2 + vcvtph2bf8 (%eax){1to16}, %xmm2 -// CHECK: vcvtneph2bf8y -1024(,%ebp,2), %xmm2 +// CHECK: vcvtph2bf8y -1024(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8y -1024(,%ebp,2), %xmm2 + vcvtph2bf8y -1024(,%ebp,2), %xmm2 -// CHECK: vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8y 4064(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f] - vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} + vcvtph2bf8y 4064(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80] - vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} + vcvtph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 +// CHECK: vcvtph2bf8 268435456(%esp,%esi,8), %ymm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 + vcvtph2bf8 268435456(%esp,%esi,8), %ymm2 -// 
CHECK: vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: vcvtph2bf8 291(%edi,%eax,4), %ymm2 {%k7} // CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} + vcvtph2bf8 291(%edi,%eax,4), %ymm2 {%k7} -// CHECK: vcvtneph2bf8 (%eax){1to32}, %ymm2 +// CHECK: vcvtph2bf8 (%eax){1to32}, %ymm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x58,0x74,0x10] - vcvtneph2bf8 (%eax){1to32}, %ymm2 + vcvtph2bf8 (%eax){1to32}, %ymm2 -// CHECK: vcvtneph2bf8 -2048(,%ebp,2), %ymm2 +// CHECK: vcvtph2bf8 -2048(,%ebp,2), %ymm2 // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8 -2048(,%ebp,2), %ymm2 + vcvtph2bf8 -2048(,%ebp,2), %ymm2 -// CHECK: vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8 8128(%ecx), %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f] - vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} + vcvtph2bf8 8128(%ecx), %ymm2 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80] - vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} + vcvtph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} -// CHECK: vcvtneph2bf8s %xmm3, %xmm2 +// CHECK: vcvtph2bf8s %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0xd3] - vcvtneph2bf8s %xmm3, %xmm2 + vcvtph2bf8s %xmm3, %xmm2 -// CHECK: vcvtneph2bf8s %xmm3, %xmm2 {%k7} +// CHECK: vcvtph2bf8s %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0xd3] - vcvtneph2bf8s %xmm3, %xmm2 {%k7} + vcvtph2bf8s %xmm3, %xmm2 {%k7} -// CHECK: vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8s %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0xd3] - vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} + vcvtph2bf8s %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8s %zmm3, %ymm2 +// CHECK: vcvtph2bf8s %zmm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0xd3] - vcvtneph2bf8s %zmm3, %ymm2 + 
vcvtph2bf8s %zmm3, %ymm2 -// CHECK: vcvtneph2bf8s %zmm3, %ymm2 {%k7} +// CHECK: vcvtph2bf8s %zmm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0xd3] - vcvtneph2bf8s %zmm3, %ymm2 {%k7} + vcvtph2bf8s %zmm3, %ymm2 {%k7} -// CHECK: vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8s %zmm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0xd3] - vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} + vcvtph2bf8s %zmm3, %ymm2 {%k7} {z} -// CHECK: vcvtneph2bf8s %ymm3, %xmm2 +// CHECK: vcvtph2bf8s %ymm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0xd3] - vcvtneph2bf8s %ymm3, %xmm2 + vcvtph2bf8s %ymm3, %xmm2 -// CHECK: vcvtneph2bf8s %ymm3, %xmm2 {%k7} +// CHECK: vcvtph2bf8s %ymm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x74,0xd3] - vcvtneph2bf8s %ymm3, %xmm2 {%k7} + vcvtph2bf8s %ymm3, %xmm2 {%k7} -// CHECK: vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8s %ymm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0xd3] - vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} + vcvtph2bf8s %ymm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 +// CHECK: vcvtph2bf8sx 268435456(%esp,%esi,8), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 + vcvtph2bf8sx 268435456(%esp,%esi,8), %xmm2 -// CHECK: vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: vcvtph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} + vcvtph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} -// CHECK: vcvtneph2bf8s (%eax){1to8}, %xmm2 +// CHECK: vcvtph2bf8s (%eax){1to8}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x74,0x10] - vcvtneph2bf8s (%eax){1to8}, %xmm2 + vcvtph2bf8s (%eax){1to8}, %xmm2 -// CHECK: vcvtneph2bf8sx -512(,%ebp,2), %xmm2 +// CHECK: vcvtph2bf8sx -512(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] 
- vcvtneph2bf8sx -512(,%ebp,2), %xmm2 + vcvtph2bf8sx -512(,%ebp,2), %xmm2 -// CHECK: vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f] - vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} + vcvtph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80] - vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} + vcvtph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8s (%eax){1to16}, %xmm2 +// CHECK: vcvtph2bf8s (%eax){1to16}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x74,0x10] - vcvtneph2bf8s (%eax){1to16}, %xmm2 + vcvtph2bf8s (%eax){1to16}, %xmm2 -// CHECK: vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 +// CHECK: vcvtph2bf8sy -1024(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 + vcvtph2bf8sy -1024(,%ebp,2), %xmm2 -// CHECK: vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f] - vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} + vcvtph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80] - vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} + vcvtph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 +// CHECK: vcvtph2bf8s 268435456(%esp,%esi,8), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 + vcvtph2bf8s 268435456(%esp,%esi,8), %ymm2 -// CHECK: vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: vcvtph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} // CHECK: encoding: 
[0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} + vcvtph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} -// CHECK: vcvtneph2bf8s (%eax){1to32}, %ymm2 +// CHECK: vcvtph2bf8s (%eax){1to32}, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x74,0x10] - vcvtneph2bf8s (%eax){1to32}, %ymm2 + vcvtph2bf8s (%eax){1to32}, %ymm2 -// CHECK: vcvtneph2bf8s -2048(,%ebp,2), %ymm2 +// CHECK: vcvtph2bf8s -2048(,%ebp,2), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8s -2048(,%ebp,2), %ymm2 + vcvtph2bf8s -2048(,%ebp,2), %ymm2 -// CHECK: vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8s 8128(%ecx), %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f] - vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} + vcvtph2bf8s 8128(%ecx), %ymm2 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80] - vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} + vcvtph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8 %xmm3, %xmm2 +// CHECK: vcvtph2hf8 %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0xd3] - vcvtneph2hf8 %xmm3, %xmm2 + vcvtph2hf8 %xmm3, %xmm2 -// CHECK: vcvtneph2hf8 %xmm3, %xmm2 {%k7} +// CHECK: vcvtph2hf8 %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0xd3] - vcvtneph2hf8 %xmm3, %xmm2 {%k7} + vcvtph2hf8 %xmm3, %xmm2 {%k7} -// CHECK: vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8 %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0xd3] - vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} + vcvtph2hf8 %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8 %zmm3, %ymm2 +// CHECK: vcvtph2hf8 %zmm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0xd3] - vcvtneph2hf8 %zmm3, %ymm2 + vcvtph2hf8 %zmm3, %ymm2 -// CHECK: vcvtneph2hf8 %zmm3, %ymm2 {%k7} +// CHECK: vcvtph2hf8 %zmm3, %ymm2 {%k7} // CHECK: 
encoding: [0x62,0xf5,0x7e,0x4f,0x18,0xd3] - vcvtneph2hf8 %zmm3, %ymm2 {%k7} + vcvtph2hf8 %zmm3, %ymm2 {%k7} -// CHECK: vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8 %zmm3, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0xd3] - vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} + vcvtph2hf8 %zmm3, %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8 %ymm3, %xmm2 +// CHECK: vcvtph2hf8 %ymm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0xd3] - vcvtneph2hf8 %ymm3, %xmm2 + vcvtph2hf8 %ymm3, %xmm2 -// CHECK: vcvtneph2hf8 %ymm3, %xmm2 {%k7} +// CHECK: vcvtph2hf8 %ymm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x18,0xd3] - vcvtneph2hf8 %ymm3, %xmm2 {%k7} + vcvtph2hf8 %ymm3, %xmm2 {%k7} -// CHECK: vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8 %ymm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0xd3] - vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} + vcvtph2hf8 %ymm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 +// CHECK: vcvtph2hf8x 268435456(%esp,%esi,8), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 + vcvtph2hf8x 268435456(%esp,%esi,8), %xmm2 -// CHECK: vcvtneph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: vcvtph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} + vcvtph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} -// CHECK: vcvtneph2hf8 (%eax){1to8}, %xmm2 +// CHECK: vcvtph2hf8 (%eax){1to8}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x18,0x10] - vcvtneph2hf8 (%eax){1to8}, %xmm2 + vcvtph2hf8 (%eax){1to8}, %xmm2 -// CHECK: vcvtneph2hf8x -512(,%ebp,2), %xmm2 +// CHECK: vcvtph2hf8x -512(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8x -512(,%ebp,2), %xmm2 + vcvtph2hf8x -512(,%ebp,2), %xmm2 -// CHECK: vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8x 
2032(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f] - vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} + vcvtph2hf8x 2032(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80] - vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} + vcvtph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8 (%eax){1to16}, %xmm2 +// CHECK: vcvtph2hf8 (%eax){1to16}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x18,0x10] - vcvtneph2hf8 (%eax){1to16}, %xmm2 + vcvtph2hf8 (%eax){1to16}, %xmm2 -// CHECK: vcvtneph2hf8y -1024(,%ebp,2), %xmm2 +// CHECK: vcvtph2hf8y -1024(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8y -1024(,%ebp,2), %xmm2 + vcvtph2hf8y -1024(,%ebp,2), %xmm2 -// CHECK: vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8y 4064(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f] - vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} + vcvtph2hf8y 4064(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80] - vcvtneph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} + vcvtph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 +// CHECK: vcvtph2hf8 268435456(%esp,%esi,8), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 + vcvtph2hf8 268435456(%esp,%esi,8), %ymm2 -// CHECK: vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: vcvtph2hf8 291(%edi,%eax,4), %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} + vcvtph2hf8 291(%edi,%eax,4), %ymm2 {%k7} -// CHECK: vcvtneph2hf8 (%eax){1to32}, %ymm2 +// 
CHECK: vcvtph2hf8 (%eax){1to32}, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x18,0x10] - vcvtneph2hf8 (%eax){1to32}, %ymm2 + vcvtph2hf8 (%eax){1to32}, %ymm2 -// CHECK: vcvtneph2hf8 -2048(,%ebp,2), %ymm2 +// CHECK: vcvtph2hf8 -2048(,%ebp,2), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8 -2048(,%ebp,2), %ymm2 + vcvtph2hf8 -2048(,%ebp,2), %ymm2 -// CHECK: vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8 8128(%ecx), %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f] - vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} + vcvtph2hf8 8128(%ecx), %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80] - vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} + vcvtph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8s %xmm3, %xmm2 +// CHECK: vcvtph2hf8s %xmm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xd3] - vcvtneph2hf8s %xmm3, %xmm2 + vcvtph2hf8s %xmm3, %xmm2 -// CHECK: vcvtneph2hf8s %xmm3, %xmm2 {%k7} +// CHECK: vcvtph2hf8s %xmm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0xd3] - vcvtneph2hf8s %xmm3, %xmm2 {%k7} + vcvtph2hf8s %xmm3, %xmm2 {%k7} -// CHECK: vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8s %xmm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0xd3] - vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} + vcvtph2hf8s %xmm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8s %zmm3, %ymm2 +// CHECK: vcvtph2hf8s %zmm3, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xd3] - vcvtneph2hf8s %zmm3, %ymm2 + vcvtph2hf8s %zmm3, %ymm2 -// CHECK: vcvtneph2hf8s %zmm3, %ymm2 {%k7} +// CHECK: vcvtph2hf8s %zmm3, %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0xd3] - vcvtneph2hf8s %zmm3, %ymm2 {%k7} + vcvtph2hf8s %zmm3, %ymm2 {%k7} -// CHECK: vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8s %zmm3, %ymm2 
{%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0xd3] - vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} + vcvtph2hf8s %zmm3, %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8s %ymm3, %xmm2 +// CHECK: vcvtph2hf8s %ymm3, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xd3] - vcvtneph2hf8s %ymm3, %xmm2 + vcvtph2hf8s %ymm3, %xmm2 -// CHECK: vcvtneph2hf8s %ymm3, %xmm2 {%k7} +// CHECK: vcvtph2hf8s %ymm3, %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x1b,0xd3] - vcvtneph2hf8s %ymm3, %xmm2 {%k7} + vcvtph2hf8s %ymm3, %xmm2 {%k7} -// CHECK: vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8s %ymm3, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0xd3] - vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} + vcvtph2hf8s %ymm3, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 +// CHECK: vcvtph2hf8sx 268435456(%esp,%esi,8), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 + vcvtph2hf8sx 268435456(%esp,%esi,8), %xmm2 -// CHECK: vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: vcvtph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} + vcvtph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} -// CHECK: vcvtneph2hf8s (%eax){1to8}, %xmm2 +// CHECK: vcvtph2hf8s (%eax){1to8}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x1b,0x10] - vcvtneph2hf8s (%eax){1to8}, %xmm2 + vcvtph2hf8s (%eax){1to8}, %xmm2 -// CHECK: vcvtneph2hf8sx -512(,%ebp,2), %xmm2 +// CHECK: vcvtph2hf8sx -512(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8sx -512(,%ebp,2), %xmm2 + vcvtph2hf8sx -512(,%ebp,2), %xmm2 -// CHECK: vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f] - vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} + vcvtph2hf8sx 2032(%ecx), %xmm2 
{%k7} {z} -// CHECK: vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80] - vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} + vcvtph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8s (%eax){1to16}, %xmm2 +// CHECK: vcvtph2hf8s (%eax){1to16}, %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x1b,0x10] - vcvtneph2hf8s (%eax){1to16}, %xmm2 + vcvtph2hf8s (%eax){1to16}, %xmm2 -// CHECK: vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 +// CHECK: vcvtph2hf8sy -1024(,%ebp,2), %xmm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 + vcvtph2hf8sy -1024(,%ebp,2), %xmm2 -// CHECK: vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f] - vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} + vcvtph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80] - vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} + vcvtph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} -// CHECK: vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 +// CHECK: vcvtph2hf8s 268435456(%esp,%esi,8), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 + vcvtph2hf8s 268435456(%esp,%esi,8), %ymm2 -// CHECK: vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: vcvtph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} + vcvtph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} -// CHECK: vcvtneph2hf8s (%eax){1to32}, %ymm2 +// CHECK: vcvtph2hf8s (%eax){1to32}, %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x1b,0x10] - vcvtneph2hf8s (%eax){1to32}, %ymm2 + 
vcvtph2hf8s (%eax){1to32}, %ymm2 -// CHECK: vcvtneph2hf8s -2048(,%ebp,2), %ymm2 +// CHECK: vcvtph2hf8s -2048(,%ebp,2), %ymm2 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8s -2048(,%ebp,2), %ymm2 + vcvtph2hf8s -2048(,%ebp,2), %ymm2 -// CHECK: vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8s 8128(%ecx), %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f] - vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} + vcvtph2hf8s 8128(%ecx), %ymm2 {%k7} {z} -// CHECK: vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80] - vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} + vcvtph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} diff --git a/llvm/test/MC/X86/avx10.2convert-32-intel.s b/llvm/test/MC/X86/avx10.2convert-32-intel.s index 493cdae7a64259..52a02f7ff963c3 100644 --- a/llvm/test/MC/X86/avx10.2convert-32-intel.s +++ b/llvm/test/MC/X86/avx10.2convert-32-intel.s @@ -656,835 +656,835 @@ // CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80] vcvthf82ph zmm2 {k7} {z}, ymmword ptr [edx - 4096] -// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymm4 +// CHECK: vcvt2ph2bf8 ymm2, ymm3, ymm4 // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0xd4] - vcvtne2ph2bf8 ymm2, ymm3, ymm4 + vcvt2ph2bf8 ymm2, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 +// CHECK: vcvt2ph2bf8 ymm2 {k7}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0xd4] - vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 + vcvt2ph2bf8 ymm2 {k7}, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0xd4] - vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 + vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmm4 +// CHECK: vcvt2ph2bf8 zmm2, zmm3, zmm4 // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0xd4] - vcvtne2ph2bf8 zmm2, zmm3, zmm4 + 
vcvt2ph2bf8 zmm2, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 +// CHECK: vcvt2ph2bf8 zmm2 {k7}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0xd4] - vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 + vcvt2ph2bf8 zmm2 {k7}, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0xd4] - vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 + vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmm4 +// CHECK: vcvt2ph2bf8 xmm2, xmm3, xmm4 // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0xd4] - vcvtne2ph2bf8 xmm2, xmm3, xmm4 + vcvt2ph2bf8 xmm2, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 +// CHECK: vcvt2ph2bf8 xmm2 {k7}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0xd4] - vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 + vcvt2ph2bf8 xmm2 {k7}, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0xd4] - vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 + vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: vcvt2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} // CHECK: encoding: 
[0x62,0xf2,0x67,0x58,0x74,0x10] - vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} + vcvt2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} -// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + vcvt2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f] - vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] -// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x74,0x52,0x80] - vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + vcvt2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} -// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: vcvt2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf2,0x67,0x38,0x74,0x10] - vcvtne2ph2bf8 ymm2, ymm3, word 
ptr [eax]{1to16} + vcvt2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} -// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + vcvt2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f] - vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] -// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x74,0x52,0x80] - vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + vcvt2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} -// CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: vcvt2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf2,0x67,0x18,0x74,0x10] - vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} + vcvt2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} -// 
CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + vcvt2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] -// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f] - vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] -// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x74,0x52,0x80] - vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + vcvt2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} -// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymm4 +// CHECK: vcvt2ph2bf8s ymm2, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0xd4] - vcvtne2ph2bf8s ymm2, ymm3, ymm4 + vcvt2ph2bf8s ymm2, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 +// CHECK: vcvt2ph2bf8s ymm2 {k7}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0xd4] - vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 + vcvt2ph2bf8s ymm2 {k7}, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0xd4] - vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 + vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 -// CHECK: vcvtne2ph2bf8s zmm2, zmm3, zmm4 +// CHECK: vcvt2ph2bf8s zmm2, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0xd4] - vcvtne2ph2bf8s zmm2, zmm3, zmm4 + vcvt2ph2bf8s zmm2, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 +// CHECK: vcvt2ph2bf8s zmm2 {k7}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0xd4] - vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 + 
vcvt2ph2bf8s zmm2 {k7}, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0xd4] - vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 + vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 -// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmm4 +// CHECK: vcvt2ph2bf8s xmm2, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0xd4] - vcvtne2ph2bf8s xmm2, xmm3, xmm4 + vcvt2ph2bf8s xmm2, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: vcvt2ph2bf8s xmm2 {k7}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0xd4] - vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 + vcvt2ph2bf8s xmm2 {k7}, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0xd4] - vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 + vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 -// CHECK: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: vcvt2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x74,0x10] - vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} + vcvt2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} -// CHECK: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: 
vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] + vcvt2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f] - vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] -// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x74,0x52,0x80] - vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + vcvt2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} -// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: vcvt2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x74,0x10] - vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} + vcvt2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} -// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: vcvt2ph2bf8s ymm2, ymm3, ymmword ptr 
[2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] + vcvt2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f] - vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] -// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x74,0x52,0x80] - vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + vcvt2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} -// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + vcvt2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: vcvt2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x74,0x10] - vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} + vcvt2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} -// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] // CHECK: encoding: 
[0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + vcvt2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] -// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f] - vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] -// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x74,0x52,0x80] - vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + vcvt2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} -// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymm4 +// CHECK: vcvt2ph2hf8 ymm2, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0xd4] - vcvtne2ph2hf8 ymm2, ymm3, ymm4 + vcvt2ph2hf8 ymm2, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 +// CHECK: vcvt2ph2hf8 ymm2 {k7}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0xd4] - vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 + vcvt2ph2hf8 ymm2 {k7}, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0xd4] - vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 + vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmm4 +// CHECK: vcvt2ph2hf8 zmm2, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0xd4] - vcvtne2ph2hf8 zmm2, zmm3, zmm4 + vcvt2ph2hf8 zmm2, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 +// CHECK: vcvt2ph2hf8 zmm2 {k7}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0xd4] - vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 + vcvt2ph2hf8 zmm2 {k7}, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 // CHECK: 
encoding: [0x62,0xf5,0x67,0xcf,0x18,0xd4] - vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 + vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmm4 +// CHECK: vcvt2ph2hf8 xmm2, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0xd4] - vcvtne2ph2hf8 xmm2, xmm3, xmm4 + vcvt2ph2hf8 xmm2, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmm4 +// CHECK: vcvt2ph2hf8 xmm2 {k7}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0xd4] - vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmm4 + vcvt2ph2hf8 xmm2 {k7}, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0xd4] - vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 + vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: vcvt2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x18,0x10] - vcvtne2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} + vcvt2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} -// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 
2048] + vcvt2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f] - vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] -// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x18,0x52,0x80] - vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + vcvt2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} -// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: vcvt2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x18,0x10] - vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} + vcvt2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} -// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + vcvt2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] -// 
CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f] - vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] -// CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x18,0x52,0x80] - vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + vcvt2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} -// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: vcvt2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x18,0x10] - vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} + vcvt2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} -// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + vcvt2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] -// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// 
CHECK: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f] - vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] -// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x18,0x52,0x80] - vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + vcvt2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} -// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymm4 +// CHECK: vcvt2ph2hf8s ymm2, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0xd4] - vcvtne2ph2hf8s ymm2, ymm3, ymm4 + vcvt2ph2hf8s ymm2, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 +// CHECK: vcvt2ph2hf8s ymm2 {k7}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0xd4] - vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 + vcvt2ph2hf8s ymm2 {k7}, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 // CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0xd4] - vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 + vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 -// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmm4 +// CHECK: vcvt2ph2hf8s zmm2, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0xd4] - vcvtne2ph2hf8s zmm2, zmm3, zmm4 + vcvt2ph2hf8s zmm2, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 +// CHECK: vcvt2ph2hf8s zmm2 {k7}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0xd4] - vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 + vcvt2ph2hf8s zmm2 {k7}, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 // CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0xd4] - vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 + vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 -// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmm4 +// CHECK: vcvt2ph2hf8s xmm2, xmm3, xmm4 // CHECK: encoding: 
[0x62,0xf5,0x67,0x08,0x1b,0xd4] - vcvtne2ph2hf8s xmm2, xmm3, xmm4 + vcvt2ph2hf8s xmm2, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: vcvt2ph2hf8s xmm2 {k7}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0xd4] - vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 + vcvt2ph2hf8s xmm2 {k7}, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0xd4] - vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 + vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 -// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: vcvt2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0x58,0x1b,0x10] - vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} + vcvt2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} -// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] + vcvt2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] 
// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f] - vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] -// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80] - vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + vcvt2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} -// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: vcvt2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0x38,0x1b,0x10] - vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} + vcvt2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} -// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] + vcvt2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] // CHECK: encoding: 
[0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f] - vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] -// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80] - vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + vcvt2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} -// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvt2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + vcvt2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: vcvt2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x18,0x1b,0x10] - vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} + vcvt2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} -// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + vcvt2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] -// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f] - 
vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] -// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80] - vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + vcvt2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} -// CHECK: vcvtneph2bf8 xmm2, xmm3 +// CHECK: vcvtph2bf8 xmm2, xmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0xd3] - vcvtneph2bf8 xmm2, xmm3 + vcvtph2bf8 xmm2, xmm3 -// CHECK: vcvtneph2bf8 xmm2 {k7}, xmm3 +// CHECK: vcvtph2bf8 xmm2 {k7}, xmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0xd3] - vcvtneph2bf8 xmm2 {k7}, xmm3 + vcvtph2bf8 xmm2 {k7}, xmm3 -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, xmm3 +// CHECK: vcvtph2bf8 xmm2 {k7} {z}, xmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0xd3] - vcvtneph2bf8 xmm2 {k7} {z}, xmm3 + vcvtph2bf8 xmm2 {k7} {z}, xmm3 -// CHECK: vcvtneph2bf8 ymm2, zmm3 +// CHECK: vcvtph2bf8 ymm2, zmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0xd3] - vcvtneph2bf8 ymm2, zmm3 + vcvtph2bf8 ymm2, zmm3 -// CHECK: vcvtneph2bf8 ymm2 {k7}, zmm3 +// CHECK: vcvtph2bf8 ymm2 {k7}, zmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0xd3] - vcvtneph2bf8 ymm2 {k7}, zmm3 + vcvtph2bf8 ymm2 {k7}, zmm3 -// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, zmm3 +// CHECK: vcvtph2bf8 ymm2 {k7} {z}, zmm3 // CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0xd3] - vcvtneph2bf8 ymm2 {k7} {z}, zmm3 + vcvtph2bf8 ymm2 {k7} {z}, zmm3 -// CHECK: vcvtneph2bf8 xmm2, ymm3 +// CHECK: vcvtph2bf8 xmm2, ymm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0xd3] - vcvtneph2bf8 xmm2, ymm3 + vcvtph2bf8 xmm2, ymm3 -// CHECK: vcvtneph2bf8 xmm2 {k7}, ymm3 +// CHECK: vcvtph2bf8 xmm2 {k7}, ymm3 // CHECK: encoding: [0x62,0xf2,0x7e,0x2f,0x74,0xd3] - vcvtneph2bf8 xmm2 {k7}, ymm3 + vcvtph2bf8 xmm2 {k7}, ymm3 -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, ymm3 +// CHECK: 
vcvtph2bf8 xmm2 {k7} {z}, ymm3 // CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0xd3] - vcvtneph2bf8 xmm2 {k7} {z}, ymm3 + vcvtph2bf8 xmm2 {k7} {z}, ymm3 -// CHECK: vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] + vcvtph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + vcvtph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2bf8 xmm2, word ptr [eax]{1to8} +// CHECK: vcvtph2bf8 xmm2, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf2,0x7e,0x18,0x74,0x10] - vcvtneph2bf8 xmm2, word ptr [eax]{1to8} + vcvtph2bf8 xmm2, word ptr [eax]{1to8} -// CHECK: vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: vcvtph2bf8 xmm2, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] + vcvtph2bf8 xmm2, xmmword ptr [2*ebp - 512] -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: vcvtph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f] - vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + vcvtph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80] - vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} -// CHECK: vcvtneph2bf8 xmm2, word ptr [eax]{1to16} +// CHECK: vcvtph2bf8 xmm2, word ptr 
[eax]{1to16} // CHECK: encoding: [0x62,0xf2,0x7e,0x38,0x74,0x10] - vcvtneph2bf8 xmm2, word ptr [eax]{1to16} + vcvtph2bf8 xmm2, word ptr [eax]{1to16} -// CHECK: vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: vcvtph2bf8 xmm2, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] + vcvtph2bf8 xmm2, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: vcvtph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f] - vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + vcvtph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] -// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80] - vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + vcvtph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} -// CHECK: vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] + vcvtph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + vcvtph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2bf8 ymm2, word ptr [eax]{1to32} +// CHECK: vcvtph2bf8 ymm2, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf2,0x7e,0x58,0x74,0x10] - vcvtneph2bf8 ymm2, word ptr [eax]{1to32} + vcvtph2bf8 ymm2, word ptr [eax]{1to32} -// CHECK: vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: vcvtph2bf8 ymm2, 
zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] + vcvtph2bf8 ymm2, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: vcvtph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f] - vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + vcvtph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] -// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: vcvtph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80] - vcvtneph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + vcvtph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} -// CHECK: vcvtneph2bf8s xmm2, xmm3 +// CHECK: vcvtph2bf8s xmm2, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0xd3] - vcvtneph2bf8s xmm2, xmm3 + vcvtph2bf8s xmm2, xmm3 -// CHECK: vcvtneph2bf8s xmm2 {k7}, xmm3 +// CHECK: vcvtph2bf8s xmm2 {k7}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0xd3] - vcvtneph2bf8s xmm2 {k7}, xmm3 + vcvtph2bf8s xmm2 {k7}, xmm3 -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, xmm3 +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0xd3] - vcvtneph2bf8s xmm2 {k7} {z}, xmm3 + vcvtph2bf8s xmm2 {k7} {z}, xmm3 -// CHECK: vcvtneph2bf8s ymm2, zmm3 +// CHECK: vcvtph2bf8s ymm2, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0xd3] - vcvtneph2bf8s ymm2, zmm3 + vcvtph2bf8s ymm2, zmm3 -// CHECK: vcvtneph2bf8s ymm2 {k7}, zmm3 +// CHECK: vcvtph2bf8s ymm2 {k7}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0xd3] - vcvtneph2bf8s ymm2 {k7}, zmm3 + vcvtph2bf8s ymm2 {k7}, zmm3 -// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, zmm3 +// CHECK: vcvtph2bf8s ymm2 {k7} {z}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0xd3] - vcvtneph2bf8s ymm2 {k7} {z}, zmm3 + vcvtph2bf8s ymm2 {k7} {z}, zmm3 -// CHECK: vcvtneph2bf8s xmm2, ymm3 +// CHECK: 
vcvtph2bf8s xmm2, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0xd3] - vcvtneph2bf8s xmm2, ymm3 + vcvtph2bf8s xmm2, ymm3 -// CHECK: vcvtneph2bf8s xmm2 {k7}, ymm3 +// CHECK: vcvtph2bf8s xmm2 {k7}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x74,0xd3] - vcvtneph2bf8s xmm2 {k7}, ymm3 + vcvtph2bf8s xmm2 {k7}, ymm3 -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, ymm3 +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0xd3] - vcvtneph2bf8s xmm2 {k7} {z}, ymm3 + vcvtph2bf8s xmm2 {k7} {z}, ymm3 -// CHECK: vcvtneph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] + vcvtph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + vcvtph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2bf8s xmm2, word ptr [eax]{1to8} +// CHECK: vcvtph2bf8s xmm2, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x74,0x10] - vcvtneph2bf8s xmm2, word ptr [eax]{1to8} + vcvtph2bf8s xmm2, word ptr [eax]{1to8} -// CHECK: vcvtneph2bf8s xmm2, xmmword ptr [2*ebp - 512] +// CHECK: vcvtph2bf8s xmm2, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8s xmm2, xmmword ptr [2*ebp - 512] + vcvtph2bf8s xmm2, xmmword ptr [2*ebp - 512] -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f] - vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + vcvtph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 
2032] -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80] - vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} -// CHECK: vcvtneph2bf8s xmm2, word ptr [eax]{1to16} +// CHECK: vcvtph2bf8s xmm2, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x74,0x10] - vcvtneph2bf8s xmm2, word ptr [eax]{1to16} + vcvtph2bf8s xmm2, word ptr [eax]{1to16} -// CHECK: vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: vcvtph2bf8s xmm2, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] + vcvtph2bf8s xmm2, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f] - vcvtneph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + vcvtph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] -// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80] - vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + vcvtph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} -// CHECK: vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] + vcvtph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2bf8s 
ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + vcvtph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2bf8s ymm2, word ptr [eax]{1to32} +// CHECK: vcvtph2bf8s ymm2, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x74,0x10] - vcvtneph2bf8s ymm2, word ptr [eax]{1to32} + vcvtph2bf8s ymm2, word ptr [eax]{1to32} -// CHECK: vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: vcvtph2bf8s ymm2, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] + vcvtph2bf8s ymm2, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: vcvtph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f] - vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + vcvtph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] -// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: vcvtph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80] - vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + vcvtph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} -// CHECK: vcvtneph2hf8 xmm2, xmm3 +// CHECK: vcvtph2hf8 xmm2, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0xd3] - vcvtneph2hf8 xmm2, xmm3 + vcvtph2hf8 xmm2, xmm3 -// CHECK: vcvtneph2hf8 xmm2 {k7}, xmm3 +// CHECK: vcvtph2hf8 xmm2 {k7}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0xd3] - vcvtneph2hf8 xmm2 {k7}, xmm3 + vcvtph2hf8 xmm2 {k7}, xmm3 -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, xmm3 +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0xd3] - vcvtneph2hf8 xmm2 {k7} {z}, xmm3 + vcvtph2hf8 xmm2 {k7} {z}, xmm3 -// CHECK: vcvtneph2hf8 ymm2, zmm3 +// CHECK: vcvtph2hf8 ymm2, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0xd3] - vcvtneph2hf8 ymm2, zmm3 + vcvtph2hf8 ymm2, zmm3 -// CHECK: vcvtneph2hf8 ymm2 
{k7}, zmm3 +// CHECK: vcvtph2hf8 ymm2 {k7}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0xd3] - vcvtneph2hf8 ymm2 {k7}, zmm3 + vcvtph2hf8 ymm2 {k7}, zmm3 -// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, zmm3 +// CHECK: vcvtph2hf8 ymm2 {k7} {z}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0xd3] - vcvtneph2hf8 ymm2 {k7} {z}, zmm3 + vcvtph2hf8 ymm2 {k7} {z}, zmm3 -// CHECK: vcvtneph2hf8 xmm2, ymm3 +// CHECK: vcvtph2hf8 xmm2, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0xd3] - vcvtneph2hf8 xmm2, ymm3 + vcvtph2hf8 xmm2, ymm3 -// CHECK: vcvtneph2hf8 xmm2 {k7}, ymm3 +// CHECK: vcvtph2hf8 xmm2 {k7}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x18,0xd3] - vcvtneph2hf8 xmm2 {k7}, ymm3 + vcvtph2hf8 xmm2 {k7}, ymm3 -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, ymm3 +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0xd3] - vcvtneph2hf8 xmm2 {k7} {z}, ymm3 + vcvtph2hf8 xmm2 {k7} {z}, ymm3 -// CHECK: vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] + vcvtph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + vcvtph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2hf8 xmm2, word ptr [eax]{1to8} +// CHECK: vcvtph2hf8 xmm2, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x18,0x10] - vcvtneph2hf8 xmm2, word ptr [eax]{1to8} + vcvtph2hf8 xmm2, word ptr [eax]{1to8} -// CHECK: vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: vcvtph2hf8 xmm2, xmmword ptr [2*ebp - 512] // CHECK: encoding: 
[0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] + vcvtph2hf8 xmm2, xmmword ptr [2*ebp - 512] -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f] - vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + vcvtph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80] - vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} -// CHECK: vcvtneph2hf8 xmm2, word ptr [eax]{1to16} +// CHECK: vcvtph2hf8 xmm2, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x18,0x10] - vcvtneph2hf8 xmm2, word ptr [eax]{1to16} + vcvtph2hf8 xmm2, word ptr [eax]{1to16} -// CHECK: vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: vcvtph2hf8 xmm2, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] + vcvtph2hf8 xmm2, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f] - vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + vcvtph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] -// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80] - vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + vcvtph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} -// CHECK: vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] // CHECK: 
encoding: [0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] + vcvtph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + vcvtph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2hf8 ymm2, word ptr [eax]{1to32} +// CHECK: vcvtph2hf8 ymm2, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x18,0x10] - vcvtneph2hf8 ymm2, word ptr [eax]{1to32} + vcvtph2hf8 ymm2, word ptr [eax]{1to32} -// CHECK: vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: vcvtph2hf8 ymm2, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] + vcvtph2hf8 ymm2, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: vcvtph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f] - vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + vcvtph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] -// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: vcvtph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80] - vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + vcvtph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} -// CHECK: vcvtneph2hf8s xmm2, xmm3 +// CHECK: vcvtph2hf8s xmm2, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xd3] - vcvtneph2hf8s xmm2, xmm3 + vcvtph2hf8s xmm2, xmm3 -// CHECK: vcvtneph2hf8s xmm2 {k7}, xmm3 +// CHECK: vcvtph2hf8s xmm2 {k7}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0xd3] - vcvtneph2hf8s xmm2 {k7}, xmm3 + vcvtph2hf8s xmm2 {k7}, xmm3 -// CHECK: 
vcvtneph2hf8s xmm2 {k7} {z}, xmm3 +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, xmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0xd3] - vcvtneph2hf8s xmm2 {k7} {z}, xmm3 + vcvtph2hf8s xmm2 {k7} {z}, xmm3 -// CHECK: vcvtneph2hf8s ymm2, zmm3 +// CHECK: vcvtph2hf8s ymm2, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xd3] - vcvtneph2hf8s ymm2, zmm3 + vcvtph2hf8s ymm2, zmm3 -// CHECK: vcvtneph2hf8s ymm2 {k7}, zmm3 +// CHECK: vcvtph2hf8s ymm2 {k7}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0xd3] - vcvtneph2hf8s ymm2 {k7}, zmm3 + vcvtph2hf8s ymm2 {k7}, zmm3 -// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, zmm3 +// CHECK: vcvtph2hf8s ymm2 {k7} {z}, zmm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0xd3] - vcvtneph2hf8s ymm2 {k7} {z}, zmm3 + vcvtph2hf8s ymm2 {k7} {z}, zmm3 -// CHECK: vcvtneph2hf8s xmm2, ymm3 +// CHECK: vcvtph2hf8s xmm2, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xd3] - vcvtneph2hf8s xmm2, ymm3 + vcvtph2hf8s xmm2, ymm3 -// CHECK: vcvtneph2hf8s xmm2 {k7}, ymm3 +// CHECK: vcvtph2hf8s xmm2 {k7}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x1b,0xd3] - vcvtneph2hf8s xmm2 {k7}, ymm3 + vcvtph2hf8s xmm2 {k7}, ymm3 -// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, ymm3 +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, ymm3 // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0xd3] - vcvtneph2hf8s xmm2 {k7} {z}, ymm3 + vcvtph2hf8s xmm2 {k7} {z}, ymm3 -// CHECK: vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] + vcvtph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + vcvtph2hf8s xmm2 {k7}, xmmword ptr [edi + 
4*eax + 291] -// CHECK: vcvtneph2hf8s xmm2, word ptr [eax]{1to8} +// CHECK: vcvtph2hf8s xmm2, word ptr [eax]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x1b,0x10] - vcvtneph2hf8s xmm2, word ptr [eax]{1to8} + vcvtph2hf8s xmm2, word ptr [eax]{1to8} -// CHECK: vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] +// CHECK: vcvtph2hf8s xmm2, xmmword ptr [2*ebp - 512] // CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] + vcvtph2hf8s xmm2, xmmword ptr [2*ebp - 512] -// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] // CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f] - vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + vcvtph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] -// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} // CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80] - vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} -// CHECK: vcvtneph2hf8s xmm2, word ptr [eax]{1to16} +// CHECK: vcvtph2hf8s xmm2, word ptr [eax]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x1b,0x10] - vcvtneph2hf8s xmm2, word ptr [eax]{1to16} + vcvtph2hf8s xmm2, word ptr [eax]{1to16} -// CHECK: vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: vcvtph2hf8s xmm2, ymmword ptr [2*ebp - 1024] // CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] + vcvtph2hf8s xmm2, ymmword ptr [2*ebp - 1024] -// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] // CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f] - vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + vcvtph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] -// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, 
word ptr [edx - 256]{1to16} +// CHECK: vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80] - vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + vcvtph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} -// CHECK: vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: vcvtph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] - vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] + vcvtph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: vcvtph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] // CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] - vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + vcvtph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] -// CHECK: vcvtneph2hf8s ymm2, word ptr [eax]{1to32} +// CHECK: vcvtph2hf8s ymm2, word ptr [eax]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x1b,0x10] - vcvtneph2hf8s ymm2, word ptr [eax]{1to32} + vcvtph2hf8s ymm2, word ptr [eax]{1to32} -// CHECK: vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: vcvtph2hf8s ymm2, zmmword ptr [2*ebp - 2048] // CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] + vcvtph2hf8s ymm2, zmmword ptr [2*ebp - 2048] -// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: vcvtph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] // CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f] - vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + vcvtph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] -// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: vcvtph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80] - vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} 
+ vcvtph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} diff --git a/llvm/test/MC/X86/avx10.2convert-64-att.s b/llvm/test/MC/X86/avx10.2convert-64-att.s index ccf1e004c07f25..c39584ec096443 100644 --- a/llvm/test/MC/X86/avx10.2convert-64-att.s +++ b/llvm/test/MC/X86/avx10.2convert-64-att.s @@ -656,835 +656,835 @@ // CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80] vcvthf82ph -4096(%rdx), %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 // CHECK: encoding: [0x62,0x82,0x47,0x20,0x74,0xf0] - vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 + vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0x82,0x47,0x27,0x74,0xf0] - vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} + vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0x82,0x47,0xa7,0x74,0xf0] - vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 // CHECK: encoding: [0x62,0x82,0x47,0x40,0x74,0xf0] - vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 + vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0x82,0x47,0x47,0x74,0xf0] - vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} + vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0x82,0x47,0xc7,0x74,0xf0] - vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 // CHECK: encoding: 
[0x62,0x82,0x47,0x00,0x74,0xf0] - vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 + vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0x82,0x47,0x07,0x74,0xf0] - vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} + vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0x82,0x47,0x87,0x74,0xf0] - vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 + vcvt2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + vcvt2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 + vcvt2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 + vcvt2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f] - vcvtne2ph2bf8 8128(%rcx), %zmm23, 
%zmm22 {%k7} {z} + vcvt2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x74,0x72,0x80] - vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 + vcvt2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + vcvt2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 + vcvt2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 + vcvt2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f] - vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: 
[0x62,0xe2,0x47,0xb7,0x74,0x72,0x80] - vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + vcvt2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + vcvt2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 + vcvt2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 + vcvt2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x47,0x87,0x74,0x71,0x7f] - vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x47,0x97,0x74,0x72,0x80] - vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 
// CHECK: encoding: [0x62,0x85,0x47,0x20,0x74,0xf0] - vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 + vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x27,0x74,0xf0] - vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} + vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x74,0xf0] - vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 // CHECK: encoding: [0x62,0x85,0x47,0x40,0x74,0xf0] - vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 + vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x47,0x74,0xf0] - vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} + vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x74,0xf0] - vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x74,0xf0] - vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 + vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x07,0x74,0xf0] - vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} + vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: 
[0x62,0x85,0x47,0x87,0x74,0xf0] - vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 + vcvt2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + vcvt2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 + vcvt2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: vcvt2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 + vcvt2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 -// CHECK: vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f] - vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + vcvt2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x74,0x72,0x80] - vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: 
vcvt2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 + vcvt2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + vcvt2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 + vcvt2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: vcvt2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 + vcvt2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 -// CHECK: vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f] - vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x74,0x72,0x80] - vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 + vcvt2ph2bf8s 
268435456(%rbp,%r14,8), %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + vcvt2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 + vcvt2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: vcvt2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 + vcvt2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 -// CHECK: vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x74,0x71,0x7f] - vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x74,0x72,0x80] - vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 // CHECK: encoding: [0x62,0x85,0x47,0x20,0x18,0xf0] - vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 + vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x27,0x18,0xf0] - vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} + vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2hf8 %ymm24, 
%ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x18,0xf0] - vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 // CHECK: encoding: [0x62,0x85,0x47,0x40,0x18,0xf0] - vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 + vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x47,0x18,0xf0] - vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} + vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x18,0xf0] - vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x18,0xf0] - vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 + vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x07,0x18,0xf0] - vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} + vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0x87,0x18,0xf0] - vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 + vcvt2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 
-// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + vcvt2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 + vcvt2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 + vcvt2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f] - vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x18,0x72,0x80] - vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 + vcvt2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%r8,%rax,4), 
%ymm23, %ymm22 {%k7} + vcvt2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 + vcvt2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 + vcvt2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f] - vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x18,0x72,0x80] - vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + vcvt2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + vcvt2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 // CHECK: encoding: 
[0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 + vcvt2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 + vcvt2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x18,0x71,0x7f] - vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x18,0x72,0x80] - vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 // CHECK: encoding: [0x62,0x85,0x47,0x20,0x1b,0xf0] - vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 + vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x27,0x1b,0xf0] - vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} + vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x1b,0xf0] - vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 // CHECK: encoding: [0x62,0x85,0x47,0x40,0x1b,0xf0] - vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 + vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8s 
%zmm24, %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x47,0x1b,0xf0] - vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} + vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x1b,0xf0] - vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x1b,0xf0] - vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 + vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0x85,0x47,0x07,0x1b,0xf0] - vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} + vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0x85,0x47,0x87,0x1b,0xf0] - vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 + vcvt2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: vcvt2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + vcvt2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} -// CHECK: vcvtne2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 // CHECK: encoding: 
[0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 + vcvt2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: vcvt2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 + vcvt2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 -// CHECK: vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f] - vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80] - vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + vcvt2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 + vcvt2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: vcvt2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + vcvt2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} -// CHECK: vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 +// CHECK: vcvt2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 + vcvt2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: 
vcvt2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 + vcvt2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 -// CHECK: vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f] - vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80] - vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + vcvt2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 + vcvt2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: vcvt2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + vcvt2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} -// CHECK: vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 + vcvt2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 -// CHECK: vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: vcvt2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 + vcvt2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 -// 
CHECK: vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f] - vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvt2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x1b,0x72,0x80] - vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + vcvt2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 %xmm23, %xmm22 +// CHECK: vcvtph2bf8 %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xf7] - vcvtneph2bf8 %xmm23, %xmm22 + vcvtph2bf8 %xmm23, %xmm22 -// CHECK: vcvtneph2bf8 %xmm23, %xmm22 {%k7} +// CHECK: vcvtph2bf8 %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa2,0x7e,0x0f,0x74,0xf7] - vcvtneph2bf8 %xmm23, %xmm22 {%k7} + vcvtph2bf8 %xmm23, %xmm22 {%k7} -// CHECK: vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8 %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa2,0x7e,0x8f,0x74,0xf7] - vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} + vcvtph2bf8 %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 %zmm23, %ymm22 +// CHECK: vcvtph2bf8 %zmm23, %ymm22 // CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xf7] - vcvtneph2bf8 %zmm23, %ymm22 + vcvtph2bf8 %zmm23, %ymm22 -// CHECK: vcvtneph2bf8 %zmm23, %ymm22 {%k7} +// CHECK: vcvtph2bf8 %zmm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xa2,0x7e,0x4f,0x74,0xf7] - vcvtneph2bf8 %zmm23, %ymm22 {%k7} + vcvtph2bf8 %zmm23, %ymm22 {%k7} -// CHECK: vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8 %zmm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xa2,0x7e,0xcf,0x74,0xf7] - vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} + vcvtph2bf8 %zmm23, %ymm22 {%k7} {z} -// CHECK: vcvtneph2bf8 %ymm23, %xmm22 +// CHECK: vcvtph2bf8 %ymm23, %xmm22 // CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x74,0xf7] - vcvtneph2bf8 %ymm23, %xmm22 + 
vcvtph2bf8 %ymm23, %xmm22 -// CHECK: vcvtneph2bf8 %ymm23, %xmm22 {%k7} +// CHECK: vcvtph2bf8 %ymm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x74,0xf7] - vcvtneph2bf8 %ymm23, %xmm22 {%k7} + vcvtph2bf8 %ymm23, %xmm22 {%k7} -// CHECK: vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8 %ymm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x74,0xf7] - vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} + vcvtph2bf8 %ymm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 +// CHECK: vcvtph2bf8x 268435456(%rbp,%r14,8), %xmm22 // CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 + vcvtph2bf8x 268435456(%rbp,%r14,8), %xmm22 -// CHECK: vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: vcvtph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} // CHECK: encoding: [0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} + vcvtph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} -// CHECK: vcvtneph2bf8 (%rip){1to8}, %xmm22 +// CHECK: vcvtph2bf8 (%rip){1to8}, %xmm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 (%rip){1to8}, %xmm22 + vcvtph2bf8 (%rip){1to8}, %xmm22 -// CHECK: vcvtneph2bf8x -512(,%rbp,2), %xmm22 +// CHECK: vcvtph2bf8x -512(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8x -512(,%rbp,2), %xmm22 + vcvtph2bf8x -512(,%rbp,2), %xmm22 -// CHECK: vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8x 2032(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f] - vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} + vcvtph2bf8x 2032(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80] - vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} + vcvtph2bf8 -256(%rdx){1to8}, 
%xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 (%rip){1to16}, %xmm22 +// CHECK: vcvtph2bf8 (%rip){1to16}, %xmm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 (%rip){1to16}, %xmm22 + vcvtph2bf8 (%rip){1to16}, %xmm22 -// CHECK: vcvtneph2bf8y -1024(,%rbp,2), %xmm22 +// CHECK: vcvtph2bf8y -1024(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8y -1024(,%rbp,2), %xmm22 + vcvtph2bf8y -1024(,%rbp,2), %xmm22 -// CHECK: vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8y 4064(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f] - vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} + vcvtph2bf8y 4064(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80] - vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} + vcvtph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8 268435456(%rbp,%r14,8), %ymm22 +// CHECK: vcvtph2bf8 268435456(%rbp,%r14,8), %ymm22 // CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8 268435456(%rbp,%r14,8), %ymm22 + vcvtph2bf8 268435456(%rbp,%r14,8), %ymm22 -// CHECK: vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: vcvtph2bf8 291(%r8,%rax,4), %ymm22 {%k7} // CHECK: encoding: [0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} + vcvtph2bf8 291(%r8,%rax,4), %ymm22 {%k7} -// CHECK: vcvtneph2bf8 (%rip){1to32}, %ymm22 +// CHECK: vcvtph2bf8 (%rip){1to32}, %ymm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 (%rip){1to32}, %ymm22 + vcvtph2bf8 (%rip){1to32}, %ymm22 -// CHECK: vcvtneph2bf8 -2048(,%rbp,2), %ymm22 +// CHECK: vcvtph2bf8 -2048(,%rbp,2), %ymm22 // CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8 
-2048(,%rbp,2), %ymm22 + vcvtph2bf8 -2048(,%rbp,2), %ymm22 -// CHECK: vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8 8128(%rcx), %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f] - vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} + vcvtph2bf8 8128(%rcx), %ymm22 {%k7} {z} -// CHECK: vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80] - vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} + vcvtph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} -// CHECK: vcvtneph2bf8s %xmm23, %xmm22 +// CHECK: vcvtph2bf8s %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xf7] - vcvtneph2bf8s %xmm23, %xmm22 + vcvtph2bf8s %xmm23, %xmm22 -// CHECK: vcvtneph2bf8s %xmm23, %xmm22 {%k7} +// CHECK: vcvtph2bf8s %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x74,0xf7] - vcvtneph2bf8s %xmm23, %xmm22 {%k7} + vcvtph2bf8s %xmm23, %xmm22 {%k7} -// CHECK: vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8s %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x74,0xf7] - vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} + vcvtph2bf8s %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8s %zmm23, %ymm22 +// CHECK: vcvtph2bf8s %zmm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xf7] - vcvtneph2bf8s %zmm23, %ymm22 + vcvtph2bf8s %zmm23, %ymm22 -// CHECK: vcvtneph2bf8s %zmm23, %ymm22 {%k7} +// CHECK: vcvtph2bf8s %zmm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x74,0xf7] - vcvtneph2bf8s %zmm23, %ymm22 {%k7} + vcvtph2bf8s %zmm23, %ymm22 {%k7} -// CHECK: vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8s %zmm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x74,0xf7] - vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} + vcvtph2bf8s %zmm23, %ymm22 {%k7} {z} -// CHECK: vcvtneph2bf8s %ymm23, %xmm22 +// CHECK: vcvtph2bf8s %ymm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x74,0xf7] - vcvtneph2bf8s 
%ymm23, %xmm22 + vcvtph2bf8s %ymm23, %xmm22 -// CHECK: vcvtneph2bf8s %ymm23, %xmm22 {%k7} +// CHECK: vcvtph2bf8s %ymm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x74,0xf7] - vcvtneph2bf8s %ymm23, %xmm22 {%k7} + vcvtph2bf8s %ymm23, %xmm22 {%k7} -// CHECK: vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8s %ymm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x74,0xf7] - vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} + vcvtph2bf8s %ymm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 +// CHECK: vcvtph2bf8sx 268435456(%rbp,%r14,8), %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 + vcvtph2bf8sx 268435456(%rbp,%r14,8), %xmm22 -// CHECK: vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: vcvtph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} + vcvtph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} -// CHECK: vcvtneph2bf8s (%rip){1to8}, %xmm22 +// CHECK: vcvtph2bf8s (%rip){1to8}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s (%rip){1to8}, %xmm22 + vcvtph2bf8s (%rip){1to8}, %xmm22 -// CHECK: vcvtneph2bf8sx -512(,%rbp,2), %xmm22 +// CHECK: vcvtph2bf8sx -512(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8sx -512(,%rbp,2), %xmm22 + vcvtph2bf8sx -512(,%rbp,2), %xmm22 -// CHECK: vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f] - vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} + vcvtph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80] - vcvtneph2bf8s -256(%rdx){1to8}, 
%xmm22 {%k7} {z} + vcvtph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8s (%rip){1to16}, %xmm22 +// CHECK: vcvtph2bf8s (%rip){1to16}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s (%rip){1to16}, %xmm22 + vcvtph2bf8s (%rip){1to16}, %xmm22 -// CHECK: vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 +// CHECK: vcvtph2bf8sy -1024(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 + vcvtph2bf8sy -1024(,%rbp,2), %xmm22 -// CHECK: vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f] - vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} + vcvtph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80] - vcvtneph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} + vcvtph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 +// CHECK: vcvtph2bf8s 268435456(%rbp,%r14,8), %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 + vcvtph2bf8s 268435456(%rbp,%r14,8), %ymm22 -// CHECK: vcvtneph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: vcvtph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} + vcvtph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} -// CHECK: vcvtneph2bf8s (%rip){1to32}, %ymm22 +// CHECK: vcvtph2bf8s (%rip){1to32}, %ymm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s (%rip){1to32}, %ymm22 + vcvtph2bf8s (%rip){1to32}, %ymm22 -// CHECK: vcvtneph2bf8s -2048(,%rbp,2), %ymm22 +// CHECK: vcvtph2bf8s -2048(,%rbp,2), %ymm22 // CHECK: encoding: 
[0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8s -2048(,%rbp,2), %ymm22 + vcvtph2bf8s -2048(,%rbp,2), %ymm22 -// CHECK: vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8s 8128(%rcx), %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f] - vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} + vcvtph2bf8s 8128(%rcx), %ymm22 {%k7} {z} -// CHECK: vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: vcvtph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80] - vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} + vcvtph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8 %xmm23, %xmm22 +// CHECK: vcvtph2hf8 %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xf7] - vcvtneph2hf8 %xmm23, %xmm22 + vcvtph2hf8 %xmm23, %xmm22 -// CHECK: vcvtneph2hf8 %xmm23, %xmm22 {%k7} +// CHECK: vcvtph2hf8 %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x18,0xf7] - vcvtneph2hf8 %xmm23, %xmm22 {%k7} + vcvtph2hf8 %xmm23, %xmm22 {%k7} -// CHECK: vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8 %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x18,0xf7] - vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} + vcvtph2hf8 %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8 %zmm23, %ymm22 +// CHECK: vcvtph2hf8 %zmm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xf7] - vcvtneph2hf8 %zmm23, %ymm22 + vcvtph2hf8 %zmm23, %ymm22 -// CHECK: vcvtneph2hf8 %zmm23, %ymm22 {%k7} +// CHECK: vcvtph2hf8 %zmm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x18,0xf7] - vcvtneph2hf8 %zmm23, %ymm22 {%k7} + vcvtph2hf8 %zmm23, %ymm22 {%k7} -// CHECK: vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8 %zmm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x18,0xf7] - vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} + vcvtph2hf8 %zmm23, %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8 %ymm23, %xmm22 +// CHECK: vcvtph2hf8 %ymm23, %xmm22 // CHECK: 
encoding: [0x62,0xa5,0x7e,0x28,0x18,0xf7] - vcvtneph2hf8 %ymm23, %xmm22 + vcvtph2hf8 %ymm23, %xmm22 -// CHECK: vcvtneph2hf8 %ymm23, %xmm22 {%k7} +// CHECK: vcvtph2hf8 %ymm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x18,0xf7] - vcvtneph2hf8 %ymm23, %xmm22 {%k7} + vcvtph2hf8 %ymm23, %xmm22 {%k7} -// CHECK: vcvtneph2hf8 %ymm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8 %ymm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x18,0xf7] - vcvtneph2hf8 %ymm23, %xmm22 {%k7} {z} + vcvtph2hf8 %ymm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 +// CHECK: vcvtph2hf8x 268435456(%rbp,%r14,8), %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 + vcvtph2hf8x 268435456(%rbp,%r14,8), %xmm22 -// CHECK: vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: vcvtph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} + vcvtph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} -// CHECK: vcvtneph2hf8 (%rip){1to8}, %xmm22 +// CHECK: vcvtph2hf8 (%rip){1to8}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 (%rip){1to8}, %xmm22 + vcvtph2hf8 (%rip){1to8}, %xmm22 -// CHECK: vcvtneph2hf8x -512(,%rbp,2), %xmm22 +// CHECK: vcvtph2hf8x -512(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8x -512(,%rbp,2), %xmm22 + vcvtph2hf8x -512(,%rbp,2), %xmm22 -// CHECK: vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8x 2032(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f] - vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} + vcvtph2hf8x 2032(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80] - 
vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} + vcvtph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8 (%rip){1to16}, %xmm22 +// CHECK: vcvtph2hf8 (%rip){1to16}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 (%rip){1to16}, %xmm22 + vcvtph2hf8 (%rip){1to16}, %xmm22 -// CHECK: vcvtneph2hf8y -1024(,%rbp,2), %xmm22 +// CHECK: vcvtph2hf8y -1024(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8y -1024(,%rbp,2), %xmm22 + vcvtph2hf8y -1024(,%rbp,2), %xmm22 -// CHECK: vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8y 4064(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f] - vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} + vcvtph2hf8y 4064(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80] - vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} + vcvtph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 +// CHECK: vcvtph2hf8 268435456(%rbp,%r14,8), %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 + vcvtph2hf8 268435456(%rbp,%r14,8), %ymm22 -// CHECK: vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: vcvtph2hf8 291(%r8,%rax,4), %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} + vcvtph2hf8 291(%r8,%rax,4), %ymm22 {%k7} -// CHECK: vcvtneph2hf8 (%rip){1to32}, %ymm22 +// CHECK: vcvtph2hf8 (%rip){1to32}, %ymm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 (%rip){1to32}, %ymm22 + vcvtph2hf8 (%rip){1to32}, %ymm22 -// CHECK: vcvtneph2hf8 -2048(,%rbp,2), %ymm22 +// CHECK: vcvtph2hf8 -2048(,%rbp,2), %ymm22 // CHECK: encoding: 
[0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8 -2048(,%rbp,2), %ymm22 + vcvtph2hf8 -2048(,%rbp,2), %ymm22 -// CHECK: vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8 8128(%rcx), %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f] - vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} + vcvtph2hf8 8128(%rcx), %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80] - vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} + vcvtph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8s %xmm23, %xmm22 +// CHECK: vcvtph2hf8s %xmm23, %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xf7] - vcvtneph2hf8s %xmm23, %xmm22 + vcvtph2hf8s %xmm23, %xmm22 -// CHECK: vcvtneph2hf8s %xmm23, %xmm22 {%k7} +// CHECK: vcvtph2hf8s %xmm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x1b,0xf7] - vcvtneph2hf8s %xmm23, %xmm22 {%k7} + vcvtph2hf8s %xmm23, %xmm22 {%k7} -// CHECK: vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8s %xmm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x1b,0xf7] - vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} + vcvtph2hf8s %xmm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8s %zmm23, %ymm22 +// CHECK: vcvtph2hf8s %zmm23, %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xf7] - vcvtneph2hf8s %zmm23, %ymm22 + vcvtph2hf8s %zmm23, %ymm22 -// CHECK: vcvtneph2hf8s %zmm23, %ymm22 {%k7} +// CHECK: vcvtph2hf8s %zmm23, %ymm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x1b,0xf7] - vcvtneph2hf8s %zmm23, %ymm22 {%k7} + vcvtph2hf8s %zmm23, %ymm22 {%k7} -// CHECK: vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8s %zmm23, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x1b,0xf7] - vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} + vcvtph2hf8s %zmm23, %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8s %ymm23, %xmm22 +// CHECK: vcvtph2hf8s %ymm23, %xmm22 
// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x1b,0xf7] - vcvtneph2hf8s %ymm23, %xmm22 + vcvtph2hf8s %ymm23, %xmm22 -// CHECK: vcvtneph2hf8s %ymm23, %xmm22 {%k7} +// CHECK: vcvtph2hf8s %ymm23, %xmm22 {%k7} // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x1b,0xf7] - vcvtneph2hf8s %ymm23, %xmm22 {%k7} + vcvtph2hf8s %ymm23, %xmm22 {%k7} -// CHECK: vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8s %ymm23, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x1b,0xf7] - vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} + vcvtph2hf8s %ymm23, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 +// CHECK: vcvtph2hf8sx 268435456(%rbp,%r14,8), %xmm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 + vcvtph2hf8sx 268435456(%rbp,%r14,8), %xmm22 -// CHECK: vcvtneph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: vcvtph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} + vcvtph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} -// CHECK: vcvtneph2hf8s (%rip){1to8}, %xmm22 +// CHECK: vcvtph2hf8s (%rip){1to8}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s (%rip){1to8}, %xmm22 + vcvtph2hf8s (%rip){1to8}, %xmm22 -// CHECK: vcvtneph2hf8sx -512(,%rbp,2), %xmm22 +// CHECK: vcvtph2hf8sx -512(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8sx -512(,%rbp,2), %xmm22 + vcvtph2hf8sx -512(,%rbp,2), %xmm22 -// CHECK: vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f] - vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} + vcvtph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} // CHECK: encoding: 
[0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80] - vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} + vcvtph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8s (%rip){1to16}, %xmm22 +// CHECK: vcvtph2hf8s (%rip){1to16}, %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s (%rip){1to16}, %xmm22 + vcvtph2hf8s (%rip){1to16}, %xmm22 -// CHECK: vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 +// CHECK: vcvtph2hf8sy -1024(,%rbp,2), %xmm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 + vcvtph2hf8sy -1024(,%rbp,2), %xmm22 -// CHECK: vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f] - vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} + vcvtph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80] - vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} + vcvtph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} -// CHECK: vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 +// CHECK: vcvtph2hf8s 268435456(%rbp,%r14,8), %ymm22 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 + vcvtph2hf8s 268435456(%rbp,%r14,8), %ymm22 -// CHECK: vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: vcvtph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} // CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} + vcvtph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} -// CHECK: vcvtneph2hf8s (%rip){1to32}, %ymm22 +// CHECK: vcvtph2hf8s (%rip){1to32}, %ymm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s (%rip){1to32}, %ymm22 + vcvtph2hf8s (%rip){1to32}, %ymm22 -// CHECK: vcvtneph2hf8s -2048(,%rbp,2), %ymm22 
+// CHECK: vcvtph2hf8s -2048(,%rbp,2), %ymm22 // CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8s -2048(,%rbp,2), %ymm22 + vcvtph2hf8s -2048(,%rbp,2), %ymm22 -// CHECK: vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8s 8128(%rcx), %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f] - vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} + vcvtph2hf8s 8128(%rcx), %ymm22 {%k7} {z} -// CHECK: vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: vcvtph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80] - vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} + vcvtph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} diff --git a/llvm/test/MC/X86/avx10.2convert-64-intel.s b/llvm/test/MC/X86/avx10.2convert-64-intel.s index 2f0cd1b2809357..35f3b4a8f1a1ab 100644 --- a/llvm/test/MC/X86/avx10.2convert-64-intel.s +++ b/llvm/test/MC/X86/avx10.2convert-64-intel.s @@ -656,835 +656,835 @@ // CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80] vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rdx - 4096] -// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymm24 +// CHECK: vcvt2ph2bf8 ymm22, ymm23, ymm24 // CHECK: encoding: [0x62,0x82,0x47,0x20,0x74,0xf0] - vcvtne2ph2bf8 ymm22, ymm23, ymm24 + vcvt2ph2bf8 ymm22, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 +// CHECK: vcvt2ph2bf8 ymm22 {k7}, ymm23, ymm24 // CHECK: encoding: [0x62,0x82,0x47,0x27,0x74,0xf0] - vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 + vcvt2ph2bf8 ymm22 {k7}, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 // CHECK: encoding: [0x62,0x82,0x47,0xa7,0x74,0xf0] - vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 + vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmm24 +// CHECK: vcvt2ph2bf8 zmm22, zmm23, zmm24 // CHECK: encoding: [0x62,0x82,0x47,0x40,0x74,0xf0] - vcvtne2ph2bf8 zmm22, zmm23, zmm24 + vcvt2ph2bf8 zmm22, zmm23, 
zmm24 -// CHECK: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 +// CHECK: vcvt2ph2bf8 zmm22 {k7}, zmm23, zmm24 // CHECK: encoding: [0x62,0x82,0x47,0x47,0x74,0xf0] - vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 + vcvt2ph2bf8 zmm22 {k7}, zmm23, zmm24 -// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 // CHECK: encoding: [0x62,0x82,0x47,0xc7,0x74,0xf0] - vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 + vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 -// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmm24 +// CHECK: vcvt2ph2bf8 xmm22, xmm23, xmm24 // CHECK: encoding: [0x62,0x82,0x47,0x00,0x74,0xf0] - vcvtne2ph2bf8 xmm22, xmm23, xmm24 + vcvt2ph2bf8 xmm22, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: vcvt2ph2bf8 xmm22 {k7}, xmm23, xmm24 // CHECK: encoding: [0x62,0x82,0x47,0x07,0x74,0xf0] - vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 + vcvt2ph2bf8 xmm22 {k7}, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 // CHECK: encoding: [0x62,0x82,0x47,0x87,0x74,0xf0] - vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 + vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + vcvt2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: vcvt2ph2bf8 zmm22, zmm23, word ptr 
[rip]{1to32} // CHECK: encoding: [0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} + vcvt2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} -// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + vcvt2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f] - vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] -// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x74,0x72,0x80] - vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + vcvt2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} -// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + vcvt2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: vcvt2ph2bf8 ymm22, ymm23, word ptr 
[rip]{1to16} // CHECK: encoding: [0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} + vcvt2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} -// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + vcvt2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f] - vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] -// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x74,0x72,0x80] - vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + vcvt2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} -// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + vcvt2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: vcvt2ph2bf8 xmm22, xmm23, word ptr 
[rip]{1to8} // CHECK: encoding: [0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} + vcvt2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} -// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + vcvt2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] -// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe2,0x47,0x87,0x74,0x71,0x7f] - vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] -// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe2,0x47,0x97,0x74,0x72,0x80] - vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + vcvt2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} -// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymm24 +// CHECK: vcvt2ph2bf8s ymm22, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x20,0x74,0xf0] - vcvtne2ph2bf8s ymm22, ymm23, ymm24 + vcvt2ph2bf8s ymm22, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 +// CHECK: vcvt2ph2bf8s ymm22 {k7}, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x27,0x74,0xf0] - vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 + vcvt2ph2bf8s ymm22 {k7}, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x74,0xf0] - vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 + vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 -// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmm24 +// CHECK: vcvt2ph2bf8s zmm22, zmm23, zmm24 // CHECK: encoding: 
[0x62,0x85,0x47,0x40,0x74,0xf0] - vcvtne2ph2bf8s zmm22, zmm23, zmm24 + vcvt2ph2bf8s zmm22, zmm23, zmm24 -// CHECK: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 +// CHECK: vcvt2ph2bf8s zmm22 {k7}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0x47,0x74,0xf0] - vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 + vcvt2ph2bf8s zmm22 {k7}, zmm23, zmm24 -// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x74,0xf0] - vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 + vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 -// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmm24 +// CHECK: vcvt2ph2bf8s xmm22, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x74,0xf0] - vcvtne2ph2bf8s xmm22, xmm23, xmm24 + vcvt2ph2bf8s xmm22, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: vcvt2ph2bf8s xmm22 {k7}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x07,0x74,0xf0] - vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 + vcvt2ph2bf8s xmm22 {k7}, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x87,0x74,0xf0] - vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 + vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 -// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + vcvt2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 
4*rax + 291] -// CHECK: vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: vcvt2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} + vcvt2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} -// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] + vcvt2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f] - vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] -// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x74,0x72,0x80] - vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + vcvt2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} -// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + vcvt2ph2bf8s ymm22 {k7}, 
ymm23, ymmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: vcvt2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} + vcvt2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} -// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] + vcvt2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f] - vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] -// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x74,0x72,0x80] - vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + vcvt2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} -// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + 
vcvt2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: vcvt2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} + vcvt2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} -// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + vcvt2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] -// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x74,0x71,0x7f] - vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] -// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x74,0x72,0x80] - vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + vcvt2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} -// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymm24 +// CHECK: vcvt2ph2hf8 ymm22, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x20,0x18,0xf0] - vcvtne2ph2hf8 ymm22, ymm23, ymm24 + vcvt2ph2hf8 ymm22, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 +// CHECK: vcvt2ph2hf8 ymm22 {k7}, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x27,0x18,0xf0] - vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 + vcvt2ph2hf8 ymm22 {k7}, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x18,0xf0] - vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 + 
vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmm24 +// CHECK: vcvt2ph2hf8 zmm22, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0x40,0x18,0xf0] - vcvtne2ph2hf8 zmm22, zmm23, zmm24 + vcvt2ph2hf8 zmm22, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 +// CHECK: vcvt2ph2hf8 zmm22 {k7}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0x47,0x18,0xf0] - vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 + vcvt2ph2hf8 zmm22 {k7}, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x18,0xf0] - vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 + vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmm24 +// CHECK: vcvt2ph2hf8 xmm22, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x18,0xf0] - vcvtne2ph2hf8 xmm22, xmm23, xmm24 + vcvt2ph2hf8 xmm22, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: vcvt2ph2hf8 xmm22 {k7}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x07,0x18,0xf0] - vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 + vcvt2ph2hf8 xmm22 {k7}, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x87,0x18,0xf0] - vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 + vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: 
[0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: vcvt2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} + vcvt2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} -// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + vcvt2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f] - vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] -// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x18,0x72,0x80] - vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + vcvt2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} -// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] // CHECK: encoding: 
[0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: vcvt2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} + vcvt2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} -// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: vcvt2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + vcvt2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f] - vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] -// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x18,0x72,0x80] - vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + vcvt2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} -// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: 
[0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: vcvt2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} + vcvt2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} -// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + vcvt2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] -// CHECK: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x18,0x71,0x7f] - vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] -// CHECK: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x18,0x72,0x80] - vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + vcvt2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} -// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymm24 +// CHECK: vcvt2ph2hf8s ymm22, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x20,0x1b,0xf0] - vcvtne2ph2hf8s ymm22, ymm23, ymm24 + vcvt2ph2hf8s ymm22, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 +// CHECK: vcvt2ph2hf8s ymm22 {k7}, ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0x27,0x1b,0xf0] - vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 + vcvt2ph2hf8s ymm22 {k7}, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: vcvt2ph2hf8s ymm22 {k7} {z}, 
ymm23, ymm24 // CHECK: encoding: [0x62,0x85,0x47,0xa7,0x1b,0xf0] - vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 + vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 -// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmm24 +// CHECK: vcvt2ph2hf8s zmm22, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0x40,0x1b,0xf0] - vcvtne2ph2hf8s zmm22, zmm23, zmm24 + vcvt2ph2hf8s zmm22, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 +// CHECK: vcvt2ph2hf8s zmm22 {k7}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0x47,0x1b,0xf0] - vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 + vcvt2ph2hf8s zmm22 {k7}, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 // CHECK: encoding: [0x62,0x85,0x47,0xc7,0x1b,0xf0] - vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 + vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 -// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmm24 +// CHECK: vcvt2ph2hf8s xmm22, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x00,0x1b,0xf0] - vcvtne2ph2hf8s xmm22, xmm23, xmm24 + vcvt2ph2hf8s xmm22, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: vcvt2ph2hf8s xmm22 {k7}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x07,0x1b,0xf0] - vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 + vcvt2ph2hf8s xmm22 {k7}, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 // CHECK: encoding: [0x62,0x85,0x47,0x87,0x1b,0xf0] - vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 + vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 -// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// 
CHECK: vcvt2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: vcvt2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} + vcvt2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} -// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] + vcvt2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f] - vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] -// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80] - vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + vcvt2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} -// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr 
[r8 + 4*rax + 291] +// CHECK: vcvt2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: vcvt2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} + vcvt2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} -// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] + vcvt2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f] - vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] -// CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80] - vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + vcvt2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} -// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtne2ph2hf8s xmm22 
{k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvt2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + vcvt2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: vcvt2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} + vcvt2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} -// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + vcvt2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] -// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f] - vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] -// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x47,0x97,0x1b,0x72,0x80] - vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + vcvt2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} -// CHECK: vcvtneph2bf8 xmm22, xmm23 +// CHECK: vcvtph2bf8 xmm22, xmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xf7] - vcvtneph2bf8 xmm22, xmm23 + vcvtph2bf8 xmm22, xmm23 -// CHECK: vcvtneph2bf8 xmm22 {k7}, xmm23 +// CHECK: vcvtph2bf8 xmm22 {k7}, xmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x0f,0x74,0xf7] - vcvtneph2bf8 xmm22 {k7}, xmm23 + vcvtph2bf8 xmm22 {k7}, xmm23 -// CHECK: 
vcvtneph2bf8 xmm22 {k7} {z}, xmm23 +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, xmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x8f,0x74,0xf7] - vcvtneph2bf8 xmm22 {k7} {z}, xmm23 + vcvtph2bf8 xmm22 {k7} {z}, xmm23 -// CHECK: vcvtneph2bf8 ymm22, zmm23 +// CHECK: vcvtph2bf8 ymm22, zmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xf7] - vcvtneph2bf8 ymm22, zmm23 + vcvtph2bf8 ymm22, zmm23 -// CHECK: vcvtneph2bf8 ymm22 {k7}, zmm23 +// CHECK: vcvtph2bf8 ymm22 {k7}, zmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x4f,0x74,0xf7] - vcvtneph2bf8 ymm22 {k7}, zmm23 + vcvtph2bf8 ymm22 {k7}, zmm23 -// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, zmm23 +// CHECK: vcvtph2bf8 ymm22 {k7} {z}, zmm23 // CHECK: encoding: [0x62,0xa2,0x7e,0xcf,0x74,0xf7] - vcvtneph2bf8 ymm22 {k7} {z}, zmm23 + vcvtph2bf8 ymm22 {k7} {z}, zmm23 -// CHECK: vcvtneph2bf8 xmm22, ymm23 +// CHECK: vcvtph2bf8 xmm22, ymm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x74,0xf7] - vcvtneph2bf8 xmm22, ymm23 + vcvtph2bf8 xmm22, ymm23 -// CHECK: vcvtneph2bf8 xmm22 {k7}, ymm23 +// CHECK: vcvtph2bf8 xmm22 {k7}, ymm23 // CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x74,0xf7] - vcvtneph2bf8 xmm22 {k7}, ymm23 + vcvtph2bf8 xmm22 {k7}, ymm23 -// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, ymm23 +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, ymm23 // CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x74,0xf7] - vcvtneph2bf8 xmm22 {k7} {z}, ymm23 + vcvtph2bf8 xmm22 {k7} {z}, ymm23 -// CHECK: vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + vcvtph2bf8 xmm22 
{k7}, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2bf8 xmm22, word ptr [rip]{1to8} +// CHECK: vcvtph2bf8 xmm22, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 xmm22, word ptr [rip]{1to8} + vcvtph2bf8 xmm22, word ptr [rip]{1to8} -// CHECK: vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: vcvtph2bf8 xmm22, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] + vcvtph2bf8 xmm22, xmmword ptr [2*rbp - 512] -// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f] - vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + vcvtph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] -// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80] - vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} -// CHECK: vcvtneph2bf8 xmm22, word ptr [rip]{1to16} +// CHECK: vcvtph2bf8 xmm22, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 xmm22, word ptr [rip]{1to16} + vcvtph2bf8 xmm22, word ptr [rip]{1to16} -// CHECK: vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: vcvtph2bf8 xmm22, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] + vcvtph2bf8 xmm22, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f] - vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + vcvtph2bf8 xmm22 {k7} {z}, 
ymmword ptr [rcx + 4064] -// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80] - vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + vcvtph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} -// CHECK: vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + vcvtph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2bf8 ymm22, word ptr [rip]{1to32} +// CHECK: vcvtph2bf8 ymm22, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8 ymm22, word ptr [rip]{1to32} + vcvtph2bf8 ymm22, word ptr [rip]{1to32} -// CHECK: vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: vcvtph2bf8 ymm22, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] + vcvtph2bf8 ymm22, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: vcvtph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f] - vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + vcvtph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] -// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: vcvtph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} // CHECK: encoding: 
[0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80] - vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + vcvtph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} -// CHECK: vcvtneph2bf8s xmm22, xmm23 +// CHECK: vcvtph2bf8s xmm22, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xf7] - vcvtneph2bf8s xmm22, xmm23 + vcvtph2bf8s xmm22, xmm23 -// CHECK: vcvtneph2bf8s xmm22 {k7}, xmm23 +// CHECK: vcvtph2bf8s xmm22 {k7}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x74,0xf7] - vcvtneph2bf8s xmm22 {k7}, xmm23 + vcvtph2bf8s xmm22 {k7}, xmm23 -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, xmm23 +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x74,0xf7] - vcvtneph2bf8s xmm22 {k7} {z}, xmm23 + vcvtph2bf8s xmm22 {k7} {z}, xmm23 -// CHECK: vcvtneph2bf8s ymm22, zmm23 +// CHECK: vcvtph2bf8s ymm22, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xf7] - vcvtneph2bf8s ymm22, zmm23 + vcvtph2bf8s ymm22, zmm23 -// CHECK: vcvtneph2bf8s ymm22 {k7}, zmm23 +// CHECK: vcvtph2bf8s ymm22 {k7}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x74,0xf7] - vcvtneph2bf8s ymm22 {k7}, zmm23 + vcvtph2bf8s ymm22 {k7}, zmm23 -// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, zmm23 +// CHECK: vcvtph2bf8s ymm22 {k7} {z}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x74,0xf7] - vcvtneph2bf8s ymm22 {k7} {z}, zmm23 + vcvtph2bf8s ymm22 {k7} {z}, zmm23 -// CHECK: vcvtneph2bf8s xmm22, ymm23 +// CHECK: vcvtph2bf8s xmm22, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x74,0xf7] - vcvtneph2bf8s xmm22, ymm23 + vcvtph2bf8s xmm22, ymm23 -// CHECK: vcvtneph2bf8s xmm22 {k7}, ymm23 +// CHECK: vcvtph2bf8s xmm22 {k7}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x74,0xf7] - vcvtneph2bf8s xmm22 {k7}, ymm23 + vcvtph2bf8s xmm22 {k7}, ymm23 -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, ymm23 +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x74,0xf7] - vcvtneph2bf8s xmm22 {k7} {z}, ymm23 + vcvtph2bf8s xmm22 {k7} {z}, ymm23 -// CHECK: vcvtneph2bf8s xmm22, xmmword 
ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + vcvtph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2bf8s xmm22, word ptr [rip]{1to8} +// CHECK: vcvtph2bf8s xmm22, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s xmm22, word ptr [rip]{1to8} + vcvtph2bf8s xmm22, word ptr [rip]{1to8} -// CHECK: vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] +// CHECK: vcvtph2bf8s xmm22, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] + vcvtph2bf8s xmm22, xmmword ptr [2*rbp - 512] -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f] - vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + vcvtph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80] - vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} -// CHECK: vcvtneph2bf8s xmm22, word ptr [rip]{1to16} +// CHECK: vcvtph2bf8s xmm22, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s xmm22, word ptr [rip]{1to16} 
+ vcvtph2bf8s xmm22, word ptr [rip]{1to16} -// CHECK: vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: vcvtph2bf8s xmm22, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] + vcvtph2bf8s xmm22, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f] - vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + vcvtph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] -// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80] - vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + vcvtph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} -// CHECK: vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + vcvtph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2bf8s ymm22, word ptr [rip]{1to32} +// CHECK: vcvtph2bf8s ymm22, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] - vcvtneph2bf8s ymm22, word ptr [rip]{1to32} + vcvtph2bf8s ymm22, word ptr [rip]{1to32} -// CHECK: vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: vcvtph2bf8s ymm22, zmmword ptr [2*rbp - 2048] // CHECK: 
encoding: [0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] + vcvtph2bf8s ymm22, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: vcvtph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f] - vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + vcvtph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] -// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: vcvtph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80] - vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + vcvtph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} -// CHECK: vcvtneph2hf8 xmm22, xmm23 +// CHECK: vcvtph2hf8 xmm22, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xf7] - vcvtneph2hf8 xmm22, xmm23 + vcvtph2hf8 xmm22, xmm23 -// CHECK: vcvtneph2hf8 xmm22 {k7}, xmm23 +// CHECK: vcvtph2hf8 xmm22 {k7}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x18,0xf7] - vcvtneph2hf8 xmm22 {k7}, xmm23 + vcvtph2hf8 xmm22 {k7}, xmm23 -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, xmm23 +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x18,0xf7] - vcvtneph2hf8 xmm22 {k7} {z}, xmm23 + vcvtph2hf8 xmm22 {k7} {z}, xmm23 -// CHECK: vcvtneph2hf8 ymm22, zmm23 +// CHECK: vcvtph2hf8 ymm22, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xf7] - vcvtneph2hf8 ymm22, zmm23 + vcvtph2hf8 ymm22, zmm23 -// CHECK: vcvtneph2hf8 ymm22 {k7}, zmm23 +// CHECK: vcvtph2hf8 ymm22 {k7}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x18,0xf7] - vcvtneph2hf8 ymm22 {k7}, zmm23 + vcvtph2hf8 ymm22 {k7}, zmm23 -// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, zmm23 +// CHECK: vcvtph2hf8 ymm22 {k7} {z}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x18,0xf7] - vcvtneph2hf8 ymm22 {k7} {z}, zmm23 + vcvtph2hf8 ymm22 {k7} {z}, zmm23 -// CHECK: vcvtneph2hf8 xmm22, ymm23 +// 
CHECK: vcvtph2hf8 xmm22, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x18,0xf7] - vcvtneph2hf8 xmm22, ymm23 + vcvtph2hf8 xmm22, ymm23 -// CHECK: vcvtneph2hf8 xmm22 {k7}, ymm23 +// CHECK: vcvtph2hf8 xmm22 {k7}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x18,0xf7] - vcvtneph2hf8 xmm22 {k7}, ymm23 + vcvtph2hf8 xmm22 {k7}, ymm23 -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, ymm23 +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x18,0xf7] - vcvtneph2hf8 xmm22 {k7} {z}, ymm23 + vcvtph2hf8 xmm22 {k7} {z}, ymm23 -// CHECK: vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + vcvtph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2hf8 xmm22, word ptr [rip]{1to8} +// CHECK: vcvtph2hf8 xmm22, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 xmm22, word ptr [rip]{1to8} + vcvtph2hf8 xmm22, word ptr [rip]{1to8} -// CHECK: vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: vcvtph2hf8 xmm22, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] + vcvtph2hf8 xmm22, xmmword ptr [2*rbp - 512] -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f] - vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + vcvtph2hf8 
xmm22 {k7} {z}, xmmword ptr [rcx + 2032] -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80] - vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} -// CHECK: vcvtneph2hf8 xmm22, word ptr [rip]{1to16} +// CHECK: vcvtph2hf8 xmm22, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 xmm22, word ptr [rip]{1to16} + vcvtph2hf8 xmm22, word ptr [rip]{1to16} -// CHECK: vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: vcvtph2hf8 xmm22, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] + vcvtph2hf8 xmm22, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f] - vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + vcvtph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] -// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80] - vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + vcvtph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} -// CHECK: vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: 
[0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + vcvtph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2hf8 ymm22, word ptr [rip]{1to32} +// CHECK: vcvtph2hf8 ymm22, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8 ymm22, word ptr [rip]{1to32} + vcvtph2hf8 ymm22, word ptr [rip]{1to32} -// CHECK: vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: vcvtph2hf8 ymm22, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] + vcvtph2hf8 ymm22, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: vcvtph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f] - vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + vcvtph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] -// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: vcvtph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80] - vcvtneph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + vcvtph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} -// CHECK: vcvtneph2hf8s xmm22, xmm23 +// CHECK: vcvtph2hf8s xmm22, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xf7] - vcvtneph2hf8s xmm22, xmm23 + vcvtph2hf8s xmm22, xmm23 -// CHECK: vcvtneph2hf8s xmm22 {k7}, xmm23 +// CHECK: vcvtph2hf8s xmm22 {k7}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x1b,0xf7] - vcvtneph2hf8s xmm22 {k7}, xmm23 + vcvtph2hf8s xmm22 {k7}, xmm23 -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, xmm23 +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, xmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x1b,0xf7] - vcvtneph2hf8s xmm22 {k7} {z}, xmm23 + vcvtph2hf8s xmm22 {k7} {z}, xmm23 -// CHECK: vcvtneph2hf8s ymm22, zmm23 +// CHECK: vcvtph2hf8s ymm22, zmm23 
// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xf7] - vcvtneph2hf8s ymm22, zmm23 + vcvtph2hf8s ymm22, zmm23 -// CHECK: vcvtneph2hf8s ymm22 {k7}, zmm23 +// CHECK: vcvtph2hf8s ymm22 {k7}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x1b,0xf7] - vcvtneph2hf8s ymm22 {k7}, zmm23 + vcvtph2hf8s ymm22 {k7}, zmm23 -// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, zmm23 +// CHECK: vcvtph2hf8s ymm22 {k7} {z}, zmm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x1b,0xf7] - vcvtneph2hf8s ymm22 {k7} {z}, zmm23 + vcvtph2hf8s ymm22 {k7} {z}, zmm23 -// CHECK: vcvtneph2hf8s xmm22, ymm23 +// CHECK: vcvtph2hf8s xmm22, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x1b,0xf7] - vcvtneph2hf8s xmm22, ymm23 + vcvtph2hf8s xmm22, ymm23 -// CHECK: vcvtneph2hf8s xmm22 {k7}, ymm23 +// CHECK: vcvtph2hf8s xmm22 {k7}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x1b,0xf7] - vcvtneph2hf8s xmm22 {k7}, ymm23 + vcvtph2hf8s xmm22 {k7}, ymm23 -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, ymm23 +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, ymm23 // CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x1b,0xf7] - vcvtneph2hf8s xmm22 {k7} {z}, ymm23 + vcvtph2hf8s xmm22 {k7} {z}, ymm23 -// CHECK: vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + vcvtph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2hf8s xmm22, word ptr [rip]{1to8} +// CHECK: vcvtph2hf8s xmm22, word ptr [rip]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s xmm22, word ptr 
[rip]{1to8} + vcvtph2hf8s xmm22, word ptr [rip]{1to8} -// CHECK: vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] +// CHECK: vcvtph2hf8s xmm22, xmmword ptr [2*rbp - 512] // CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] - vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] + vcvtph2hf8s xmm22, xmmword ptr [2*rbp - 512] -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] // CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f] - vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + vcvtph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80] - vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} -// CHECK: vcvtneph2hf8s xmm22, word ptr [rip]{1to16} +// CHECK: vcvtph2hf8s xmm22, word ptr [rip]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s xmm22, word ptr [rip]{1to16} + vcvtph2hf8s xmm22, word ptr [rip]{1to16} -// CHECK: vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: vcvtph2hf8s xmm22, ymmword ptr [2*rbp - 1024] // CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] - vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] + vcvtph2hf8s xmm22, ymmword ptr [2*rbp - 1024] -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] // CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f] - vcvtneph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + vcvtph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] -// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80] - 
vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + vcvtph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} -// CHECK: vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: vcvtph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] - vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + vcvtph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] -// CHECK: vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: vcvtph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] // CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] - vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + vcvtph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] -// CHECK: vcvtneph2hf8s ymm22, word ptr [rip]{1to32} +// CHECK: vcvtph2hf8s ymm22, word ptr [rip]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00] - vcvtneph2hf8s ymm22, word ptr [rip]{1to32} + vcvtph2hf8s ymm22, word ptr [rip]{1to32} -// CHECK: vcvtneph2hf8s ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: vcvtph2hf8s ymm22, zmmword ptr [2*rbp - 2048] // CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] - vcvtneph2hf8s ymm22, zmmword ptr [2*rbp - 2048] + vcvtph2hf8s ymm22, zmmword ptr [2*rbp - 2048] -// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: vcvtph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] // CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f] - vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + vcvtph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] -// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: vcvtph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} // CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80] - vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + vcvtph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} diff --git a/llvm/test/TableGen/x86-fold-tables.inc 
b/llvm/test/TableGen/x86-fold-tables.inc index 0a8c5b4b76e9d5..5df6adcf38024b 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1197,18 +1197,6 @@ static const X86FoldTableEntry Table1[] = { {X86::VCVTHF82PHZ128rr, X86::VCVTHF82PHZ128rm, TB_NO_REVERSE}, {X86::VCVTHF82PHZ256rr, X86::VCVTHF82PHZ256rm, 0}, {X86::VCVTHF82PHZrr, X86::VCVTHF82PHZrm, 0}, - {X86::VCVTNEPH2BF8SZ128rr, X86::VCVTNEPH2BF8SZ128rm, 0}, - {X86::VCVTNEPH2BF8SZ256rr, X86::VCVTNEPH2BF8SZ256rm, 0}, - {X86::VCVTNEPH2BF8SZrr, X86::VCVTNEPH2BF8SZrm, 0}, - {X86::VCVTNEPH2BF8Z128rr, X86::VCVTNEPH2BF8Z128rm, 0}, - {X86::VCVTNEPH2BF8Z256rr, X86::VCVTNEPH2BF8Z256rm, 0}, - {X86::VCVTNEPH2BF8Zrr, X86::VCVTNEPH2BF8Zrm, 0}, - {X86::VCVTNEPH2HF8SZ128rr, X86::VCVTNEPH2HF8SZ128rm, 0}, - {X86::VCVTNEPH2HF8SZ256rr, X86::VCVTNEPH2HF8SZ256rm, 0}, - {X86::VCVTNEPH2HF8SZrr, X86::VCVTNEPH2HF8SZrm, 0}, - {X86::VCVTNEPH2HF8Z128rr, X86::VCVTNEPH2HF8Z128rm, 0}, - {X86::VCVTNEPH2HF8Z256rr, X86::VCVTNEPH2HF8Z256rm, 0}, - {X86::VCVTNEPH2HF8Zrr, X86::VCVTNEPH2HF8Zrm, 0}, {X86::VCVTNEPS2BF16Yrr, X86::VCVTNEPS2BF16Yrm, 0}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0}, {X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0}, @@ -1236,9 +1224,21 @@ static const X86FoldTableEntry Table1[] = { {X86::VCVTPD2UQQZ128rr, X86::VCVTPD2UQQZ128rm, 0}, {X86::VCVTPD2UQQZ256rr, X86::VCVTPD2UQQZ256rm, 0}, {X86::VCVTPD2UQQZrr, X86::VCVTPD2UQQZrm, 0}, + {X86::VCVTPH2BF8SZ128rr, X86::VCVTPH2BF8SZ128rm, 0}, + {X86::VCVTPH2BF8SZ256rr, X86::VCVTPH2BF8SZ256rm, 0}, + {X86::VCVTPH2BF8SZrr, X86::VCVTPH2BF8SZrm, 0}, + {X86::VCVTPH2BF8Z128rr, X86::VCVTPH2BF8Z128rm, 0}, + {X86::VCVTPH2BF8Z256rr, X86::VCVTPH2BF8Z256rm, 0}, + {X86::VCVTPH2BF8Zrr, X86::VCVTPH2BF8Zrm, 0}, {X86::VCVTPH2DQZ128rr, X86::VCVTPH2DQZ128rm, TB_NO_REVERSE}, {X86::VCVTPH2DQZ256rr, X86::VCVTPH2DQZ256rm, 0}, {X86::VCVTPH2DQZrr, X86::VCVTPH2DQZrm, 0}, + {X86::VCVTPH2HF8SZ128rr, X86::VCVTPH2HF8SZ128rm, 0}, + {X86::VCVTPH2HF8SZ256rr, 
X86::VCVTPH2HF8SZ256rm, 0}, + {X86::VCVTPH2HF8SZrr, X86::VCVTPH2HF8SZrm, 0}, + {X86::VCVTPH2HF8Z128rr, X86::VCVTPH2HF8Z128rm, 0}, + {X86::VCVTPH2HF8Z256rr, X86::VCVTPH2HF8Z256rm, 0}, + {X86::VCVTPH2HF8Zrr, X86::VCVTPH2HF8Zrm, 0}, {X86::VCVTPH2IBSZ128rr, X86::VCVTPH2IBSZ128rm, 0}, {X86::VCVTPH2IBSZ256rr, X86::VCVTPH2IBSZ256rm, 0}, {X86::VCVTPH2IBSZrr, X86::VCVTPH2IBSZrm, 0}, @@ -2538,6 +2538,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VCMPSSZrri_Int, X86::VCMPSSZrmi_Int, TB_NO_REVERSE}, {X86::VCMPSSrri, X86::VCMPSSrmi, 0}, {X86::VCMPSSrri_Int, X86::VCMPSSrmi_Int, TB_NO_REVERSE}, + {X86::VCVT2PH2BF8SZ128rr, X86::VCVT2PH2BF8SZ128rm, 0}, + {X86::VCVT2PH2BF8SZ256rr, X86::VCVT2PH2BF8SZ256rm, 0}, + {X86::VCVT2PH2BF8SZrr, X86::VCVT2PH2BF8SZrm, 0}, + {X86::VCVT2PH2BF8Z128rr, X86::VCVT2PH2BF8Z128rm, 0}, + {X86::VCVT2PH2BF8Z256rr, X86::VCVT2PH2BF8Z256rm, 0}, + {X86::VCVT2PH2BF8Zrr, X86::VCVT2PH2BF8Zrm, 0}, + {X86::VCVT2PH2HF8SZ128rr, X86::VCVT2PH2HF8SZ128rm, 0}, + {X86::VCVT2PH2HF8SZ256rr, X86::VCVT2PH2HF8SZ256rm, 0}, + {X86::VCVT2PH2HF8SZrr, X86::VCVT2PH2HF8SZrm, 0}, + {X86::VCVT2PH2HF8Z128rr, X86::VCVT2PH2HF8Z128rm, 0}, + {X86::VCVT2PH2HF8Z256rr, X86::VCVT2PH2HF8Z256rm, 0}, + {X86::VCVT2PH2HF8Zrr, X86::VCVT2PH2HF8Zrm, 0}, {X86::VCVT2PS2PHXZ128rr, X86::VCVT2PS2PHXZ128rm, 0}, {X86::VCVT2PS2PHXZ256rr, X86::VCVT2PS2PHXZ256rm, 0}, {X86::VCVT2PS2PHXZrr, X86::VCVT2PS2PHXZrm, 0}, @@ -2571,33 +2583,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VCVTHF82PHZ128rrkz, X86::VCVTHF82PHZ128rmkz, TB_NO_REVERSE}, {X86::VCVTHF82PHZ256rrkz, X86::VCVTHF82PHZ256rmkz, 0}, {X86::VCVTHF82PHZrrkz, X86::VCVTHF82PHZrmkz, 0}, - {X86::VCVTNE2PH2BF8SZ128rr, X86::VCVTNE2PH2BF8SZ128rm, 0}, - {X86::VCVTNE2PH2BF8SZ256rr, X86::VCVTNE2PH2BF8SZ256rm, 0}, - {X86::VCVTNE2PH2BF8SZrr, X86::VCVTNE2PH2BF8SZrm, 0}, - {X86::VCVTNE2PH2BF8Z128rr, X86::VCVTNE2PH2BF8Z128rm, 0}, - {X86::VCVTNE2PH2BF8Z256rr, X86::VCVTNE2PH2BF8Z256rm, 0}, - {X86::VCVTNE2PH2BF8Zrr, X86::VCVTNE2PH2BF8Zrm, 0}, - 
{X86::VCVTNE2PH2HF8SZ128rr, X86::VCVTNE2PH2HF8SZ128rm, 0}, - {X86::VCVTNE2PH2HF8SZ256rr, X86::VCVTNE2PH2HF8SZ256rm, 0}, - {X86::VCVTNE2PH2HF8SZrr, X86::VCVTNE2PH2HF8SZrm, 0}, - {X86::VCVTNE2PH2HF8Z128rr, X86::VCVTNE2PH2HF8Z128rm, 0}, - {X86::VCVTNE2PH2HF8Z256rr, X86::VCVTNE2PH2HF8Z256rm, 0}, - {X86::VCVTNE2PH2HF8Zrr, X86::VCVTNE2PH2HF8Zrm, 0}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0}, - {X86::VCVTNEPH2BF8SZ128rrkz, X86::VCVTNEPH2BF8SZ128rmkz, 0}, - {X86::VCVTNEPH2BF8SZ256rrkz, X86::VCVTNEPH2BF8SZ256rmkz, 0}, - {X86::VCVTNEPH2BF8SZrrkz, X86::VCVTNEPH2BF8SZrmkz, 0}, - {X86::VCVTNEPH2BF8Z128rrkz, X86::VCVTNEPH2BF8Z128rmkz, 0}, - {X86::VCVTNEPH2BF8Z256rrkz, X86::VCVTNEPH2BF8Z256rmkz, 0}, - {X86::VCVTNEPH2BF8Zrrkz, X86::VCVTNEPH2BF8Zrmkz, 0}, - {X86::VCVTNEPH2HF8SZ128rrkz, X86::VCVTNEPH2HF8SZ128rmkz, 0}, - {X86::VCVTNEPH2HF8SZ256rrkz, X86::VCVTNEPH2HF8SZ256rmkz, 0}, - {X86::VCVTNEPH2HF8SZrrkz, X86::VCVTNEPH2HF8SZrmkz, 0}, - {X86::VCVTNEPH2HF8Z128rrkz, X86::VCVTNEPH2HF8Z128rmkz, 0}, - {X86::VCVTNEPH2HF8Z256rrkz, X86::VCVTNEPH2HF8Z256rmkz, 0}, - {X86::VCVTNEPH2HF8Zrrkz, X86::VCVTNEPH2HF8Zrmkz, 0}, {X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0}, {X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0}, {X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0}, @@ -2619,9 +2607,21 @@ static const X86FoldTableEntry Table2[] = { {X86::VCVTPD2UQQZ128rrkz, X86::VCVTPD2UQQZ128rmkz, 0}, {X86::VCVTPD2UQQZ256rrkz, X86::VCVTPD2UQQZ256rmkz, 0}, {X86::VCVTPD2UQQZrrkz, X86::VCVTPD2UQQZrmkz, 0}, + {X86::VCVTPH2BF8SZ128rrkz, X86::VCVTPH2BF8SZ128rmkz, 0}, + {X86::VCVTPH2BF8SZ256rrkz, X86::VCVTPH2BF8SZ256rmkz, 0}, + {X86::VCVTPH2BF8SZrrkz, X86::VCVTPH2BF8SZrmkz, 0}, + {X86::VCVTPH2BF8Z128rrkz, X86::VCVTPH2BF8Z128rmkz, 0}, + {X86::VCVTPH2BF8Z256rrkz, X86::VCVTPH2BF8Z256rmkz, 0}, + {X86::VCVTPH2BF8Zrrkz, X86::VCVTPH2BF8Zrmkz, 0}, 
{X86::VCVTPH2DQZ128rrkz, X86::VCVTPH2DQZ128rmkz, TB_NO_REVERSE}, {X86::VCVTPH2DQZ256rrkz, X86::VCVTPH2DQZ256rmkz, 0}, {X86::VCVTPH2DQZrrkz, X86::VCVTPH2DQZrmkz, 0}, + {X86::VCVTPH2HF8SZ128rrkz, X86::VCVTPH2HF8SZ128rmkz, 0}, + {X86::VCVTPH2HF8SZ256rrkz, X86::VCVTPH2HF8SZ256rmkz, 0}, + {X86::VCVTPH2HF8SZrrkz, X86::VCVTPH2HF8SZrmkz, 0}, + {X86::VCVTPH2HF8Z128rrkz, X86::VCVTPH2HF8Z128rmkz, 0}, + {X86::VCVTPH2HF8Z256rrkz, X86::VCVTPH2HF8Z256rmkz, 0}, + {X86::VCVTPH2HF8Zrrkz, X86::VCVTPH2HF8Zrmkz, 0}, {X86::VCVTPH2IBSZ128rrkz, X86::VCVTPH2IBSZ128rmkz, 0}, {X86::VCVTPH2IBSZ256rrkz, X86::VCVTPH2IBSZ256rmkz, 0}, {X86::VCVTPH2IBSZrrkz, X86::VCVTPH2IBSZrmkz, 0}, @@ -4291,6 +4291,18 @@ static const X86FoldTableEntry Table3[] = { {X86::VCMPSDZrrik_Int, X86::VCMPSDZrmik_Int, TB_NO_REVERSE}, {X86::VCMPSHZrrik_Int, X86::VCMPSHZrmik_Int, TB_NO_REVERSE}, {X86::VCMPSSZrrik_Int, X86::VCMPSSZrmik_Int, TB_NO_REVERSE}, + {X86::VCVT2PH2BF8SZ128rrkz, X86::VCVT2PH2BF8SZ128rmkz, 0}, + {X86::VCVT2PH2BF8SZ256rrkz, X86::VCVT2PH2BF8SZ256rmkz, 0}, + {X86::VCVT2PH2BF8SZrrkz, X86::VCVT2PH2BF8SZrmkz, 0}, + {X86::VCVT2PH2BF8Z128rrkz, X86::VCVT2PH2BF8Z128rmkz, 0}, + {X86::VCVT2PH2BF8Z256rrkz, X86::VCVT2PH2BF8Z256rmkz, 0}, + {X86::VCVT2PH2BF8Zrrkz, X86::VCVT2PH2BF8Zrmkz, 0}, + {X86::VCVT2PH2HF8SZ128rrkz, X86::VCVT2PH2HF8SZ128rmkz, 0}, + {X86::VCVT2PH2HF8SZ256rrkz, X86::VCVT2PH2HF8SZ256rmkz, 0}, + {X86::VCVT2PH2HF8SZrrkz, X86::VCVT2PH2HF8SZrmkz, 0}, + {X86::VCVT2PH2HF8Z128rrkz, X86::VCVT2PH2HF8Z128rmkz, 0}, + {X86::VCVT2PH2HF8Z256rrkz, X86::VCVT2PH2HF8Z256rmkz, 0}, + {X86::VCVT2PH2HF8Zrrkz, X86::VCVT2PH2HF8Zrmkz, 0}, {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmkz, 0}, {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmkz, 0}, {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmkz, 0}, @@ -4324,33 +4336,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VCVTHF82PHZ128rrk, X86::VCVTHF82PHZ128rmk, TB_NO_REVERSE}, {X86::VCVTHF82PHZ256rrk, X86::VCVTHF82PHZ256rmk, 0}, {X86::VCVTHF82PHZrrk, X86::VCVTHF82PHZrmk, 
0}, - {X86::VCVTNE2PH2BF8SZ128rrkz, X86::VCVTNE2PH2BF8SZ128rmkz, 0}, - {X86::VCVTNE2PH2BF8SZ256rrkz, X86::VCVTNE2PH2BF8SZ256rmkz, 0}, - {X86::VCVTNE2PH2BF8SZrrkz, X86::VCVTNE2PH2BF8SZrmkz, 0}, - {X86::VCVTNE2PH2BF8Z128rrkz, X86::VCVTNE2PH2BF8Z128rmkz, 0}, - {X86::VCVTNE2PH2BF8Z256rrkz, X86::VCVTNE2PH2BF8Z256rmkz, 0}, - {X86::VCVTNE2PH2BF8Zrrkz, X86::VCVTNE2PH2BF8Zrmkz, 0}, - {X86::VCVTNE2PH2HF8SZ128rrkz, X86::VCVTNE2PH2HF8SZ128rmkz, 0}, - {X86::VCVTNE2PH2HF8SZ256rrkz, X86::VCVTNE2PH2HF8SZ256rmkz, 0}, - {X86::VCVTNE2PH2HF8SZrrkz, X86::VCVTNE2PH2HF8SZrmkz, 0}, - {X86::VCVTNE2PH2HF8Z128rrkz, X86::VCVTNE2PH2HF8Z128rmkz, 0}, - {X86::VCVTNE2PH2HF8Z256rrkz, X86::VCVTNE2PH2HF8Z256rmkz, 0}, - {X86::VCVTNE2PH2HF8Zrrkz, X86::VCVTNE2PH2HF8Zrmkz, 0}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0}, - {X86::VCVTNEPH2BF8SZ128rrk, X86::VCVTNEPH2BF8SZ128rmk, 0}, - {X86::VCVTNEPH2BF8SZ256rrk, X86::VCVTNEPH2BF8SZ256rmk, 0}, - {X86::VCVTNEPH2BF8SZrrk, X86::VCVTNEPH2BF8SZrmk, 0}, - {X86::VCVTNEPH2BF8Z128rrk, X86::VCVTNEPH2BF8Z128rmk, 0}, - {X86::VCVTNEPH2BF8Z256rrk, X86::VCVTNEPH2BF8Z256rmk, 0}, - {X86::VCVTNEPH2BF8Zrrk, X86::VCVTNEPH2BF8Zrmk, 0}, - {X86::VCVTNEPH2HF8SZ128rrk, X86::VCVTNEPH2HF8SZ128rmk, 0}, - {X86::VCVTNEPH2HF8SZ256rrk, X86::VCVTNEPH2HF8SZ256rmk, 0}, - {X86::VCVTNEPH2HF8SZrrk, X86::VCVTNEPH2HF8SZrmk, 0}, - {X86::VCVTNEPH2HF8Z128rrk, X86::VCVTNEPH2HF8Z128rmk, 0}, - {X86::VCVTNEPH2HF8Z256rrk, X86::VCVTNEPH2HF8Z256rmk, 0}, - {X86::VCVTNEPH2HF8Zrrk, X86::VCVTNEPH2HF8Zrmk, 0}, {X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0}, {X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0}, {X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0}, @@ -4372,9 +4360,21 @@ static const X86FoldTableEntry Table3[] = { {X86::VCVTPD2UQQZ128rrk, X86::VCVTPD2UQQZ128rmk, 0}, {X86::VCVTPD2UQQZ256rrk, X86::VCVTPD2UQQZ256rmk, 0}, {X86::VCVTPD2UQQZrrk, 
X86::VCVTPD2UQQZrmk, 0}, + {X86::VCVTPH2BF8SZ128rrk, X86::VCVTPH2BF8SZ128rmk, 0}, + {X86::VCVTPH2BF8SZ256rrk, X86::VCVTPH2BF8SZ256rmk, 0}, + {X86::VCVTPH2BF8SZrrk, X86::VCVTPH2BF8SZrmk, 0}, + {X86::VCVTPH2BF8Z128rrk, X86::VCVTPH2BF8Z128rmk, 0}, + {X86::VCVTPH2BF8Z256rrk, X86::VCVTPH2BF8Z256rmk, 0}, + {X86::VCVTPH2BF8Zrrk, X86::VCVTPH2BF8Zrmk, 0}, {X86::VCVTPH2DQZ128rrk, X86::VCVTPH2DQZ128rmk, TB_NO_REVERSE}, {X86::VCVTPH2DQZ256rrk, X86::VCVTPH2DQZ256rmk, 0}, {X86::VCVTPH2DQZrrk, X86::VCVTPH2DQZrmk, 0}, + {X86::VCVTPH2HF8SZ128rrk, X86::VCVTPH2HF8SZ128rmk, 0}, + {X86::VCVTPH2HF8SZ256rrk, X86::VCVTPH2HF8SZ256rmk, 0}, + {X86::VCVTPH2HF8SZrrk, X86::VCVTPH2HF8SZrmk, 0}, + {X86::VCVTPH2HF8Z128rrk, X86::VCVTPH2HF8Z128rmk, 0}, + {X86::VCVTPH2HF8Z256rrk, X86::VCVTPH2HF8Z256rmk, 0}, + {X86::VCVTPH2HF8Zrrk, X86::VCVTPH2HF8Zrmk, 0}, {X86::VCVTPH2IBSZ128rrk, X86::VCVTPH2IBSZ128rmk, 0}, {X86::VCVTPH2IBSZ256rrk, X86::VCVTPH2IBSZ256rmk, 0}, {X86::VCVTPH2IBSZrrk, X86::VCVTPH2IBSZrmk, 0}, @@ -6110,6 +6110,18 @@ static const X86FoldTableEntry Table4[] = { {X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0}, {X86::VANDPSZrrk, X86::VANDPSZrmk, 0}, + {X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmk, 0}, + {X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmk, 0}, + {X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmk, 0}, + {X86::VCVT2PH2BF8Z128rrk, X86::VCVT2PH2BF8Z128rmk, 0}, + {X86::VCVT2PH2BF8Z256rrk, X86::VCVT2PH2BF8Z256rmk, 0}, + {X86::VCVT2PH2BF8Zrrk, X86::VCVT2PH2BF8Zrmk, 0}, + {X86::VCVT2PH2HF8SZ128rrk, X86::VCVT2PH2HF8SZ128rmk, 0}, + {X86::VCVT2PH2HF8SZ256rrk, X86::VCVT2PH2HF8SZ256rmk, 0}, + {X86::VCVT2PH2HF8SZrrk, X86::VCVT2PH2HF8SZrmk, 0}, + {X86::VCVT2PH2HF8Z128rrk, X86::VCVT2PH2HF8Z128rmk, 0}, + {X86::VCVT2PH2HF8Z256rrk, X86::VCVT2PH2HF8Z256rmk, 0}, + {X86::VCVT2PH2HF8Zrrk, X86::VCVT2PH2HF8Zrmk, 0}, {X86::VCVT2PS2PHXZ128rrk, X86::VCVT2PS2PHXZ128rmk, 0}, {X86::VCVT2PS2PHXZ256rrk, X86::VCVT2PS2PHXZ256rmk, 0}, {X86::VCVT2PS2PHXZrrk, 
X86::VCVT2PS2PHXZrmk, 0}, @@ -6125,18 +6137,6 @@ static const X86FoldTableEntry Table4[] = { {X86::VCVTBIASPH2HF8Z128rrk, X86::VCVTBIASPH2HF8Z128rmk, 0}, {X86::VCVTBIASPH2HF8Z256rrk, X86::VCVTBIASPH2HF8Z256rmk, 0}, {X86::VCVTBIASPH2HF8Zrrk, X86::VCVTBIASPH2HF8Zrmk, 0}, - {X86::VCVTNE2PH2BF8SZ128rrk, X86::VCVTNE2PH2BF8SZ128rmk, 0}, - {X86::VCVTNE2PH2BF8SZ256rrk, X86::VCVTNE2PH2BF8SZ256rmk, 0}, - {X86::VCVTNE2PH2BF8SZrrk, X86::VCVTNE2PH2BF8SZrmk, 0}, - {X86::VCVTNE2PH2BF8Z128rrk, X86::VCVTNE2PH2BF8Z128rmk, 0}, - {X86::VCVTNE2PH2BF8Z256rrk, X86::VCVTNE2PH2BF8Z256rmk, 0}, - {X86::VCVTNE2PH2BF8Zrrk, X86::VCVTNE2PH2BF8Zrmk, 0}, - {X86::VCVTNE2PH2HF8SZ128rrk, X86::VCVTNE2PH2HF8SZ128rmk, 0}, - {X86::VCVTNE2PH2HF8SZ256rrk, X86::VCVTNE2PH2HF8SZ256rmk, 0}, - {X86::VCVTNE2PH2HF8SZrrk, X86::VCVTNE2PH2HF8SZrmk, 0}, - {X86::VCVTNE2PH2HF8Z128rrk, X86::VCVTNE2PH2HF8Z128rmk, 0}, - {X86::VCVTNE2PH2HF8Z256rrk, X86::VCVTNE2PH2HF8Z256rmk, 0}, - {X86::VCVTNE2PH2HF8Zrrk, X86::VCVTNE2PH2HF8Zrmk, 0}, {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0}, @@ -7438,18 +7438,6 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rmb, TB_BCAST_D}, {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rmb, TB_BCAST_D}, {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrmb, TB_BCAST_D}, - {X86::VCVTNEPH2BF8SZ128rr, X86::VCVTNEPH2BF8SZ128rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZ256rr, X86::VCVTNEPH2BF8SZ256rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZrr, X86::VCVTNEPH2BF8SZrmb, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z128rr, X86::VCVTNEPH2BF8Z128rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z256rr, X86::VCVTNEPH2BF8Z256rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Zrr, X86::VCVTNEPH2BF8Zrmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ128rr, X86::VCVTNEPH2HF8SZ128rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ256rr, X86::VCVTNEPH2HF8SZ256rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZrr, 
X86::VCVTNEPH2HF8SZrmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z128rr, X86::VCVTNEPH2HF8Z128rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z256rr, X86::VCVTNEPH2HF8Z256rmb, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Zrr, X86::VCVTNEPH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrmb, TB_BCAST_SS}, @@ -7471,9 +7459,21 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTPD2UQQZ128rr, X86::VCVTPD2UQQZ128rmb, TB_BCAST_SD}, {X86::VCVTPD2UQQZ256rr, X86::VCVTPD2UQQZ256rmb, TB_BCAST_SD}, {X86::VCVTPD2UQQZrr, X86::VCVTPD2UQQZrmb, TB_BCAST_SD}, + {X86::VCVTPH2BF8SZ128rr, X86::VCVTPH2BF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZ256rr, X86::VCVTPH2BF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZrr, X86::VCVTPH2BF8SZrmb, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z128rr, X86::VCVTPH2BF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z256rr, X86::VCVTPH2BF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTPH2BF8Zrr, X86::VCVTPH2BF8Zrmb, TB_BCAST_SH}, {X86::VCVTPH2DQZ128rr, X86::VCVTPH2DQZ128rmb, TB_BCAST_SH}, {X86::VCVTPH2DQZ256rr, X86::VCVTPH2DQZ256rmb, TB_BCAST_SH}, {X86::VCVTPH2DQZrr, X86::VCVTPH2DQZrmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ128rr, X86::VCVTPH2HF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ256rr, X86::VCVTPH2HF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZrr, X86::VCVTPH2HF8SZrmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z128rr, X86::VCVTPH2HF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z256rr, X86::VCVTPH2HF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTPH2HF8Zrr, X86::VCVTPH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTPH2IBSZ128rr, X86::VCVTPH2IBSZ128rmb, TB_BCAST_SH}, {X86::VCVTPH2IBSZ256rr, X86::VCVTPH2IBSZ256rmb, TB_BCAST_SH}, {X86::VCVTPH2IBSZrr, X86::VCVTPH2IBSZrmb, TB_BCAST_SH}, @@ -7862,6 +7862,18 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS}, {X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS}, 
{X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS}, + {X86::VCVT2PH2BF8SZ128rr, X86::VCVT2PH2BF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZ256rr, X86::VCVT2PH2BF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZrr, X86::VCVT2PH2BF8SZrmb, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z128rr, X86::VCVT2PH2BF8Z128rmb, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z256rr, X86::VCVT2PH2BF8Z256rmb, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Zrr, X86::VCVT2PH2BF8Zrmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ128rr, X86::VCVT2PH2HF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ256rr, X86::VCVT2PH2HF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZrr, X86::VCVT2PH2HF8SZrmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z128rr, X86::VCVT2PH2HF8Z128rmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z256rr, X86::VCVT2PH2HF8Z256rmb, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Zrr, X86::VCVT2PH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVT2PS2PHXZ128rr, X86::VCVT2PS2PHXZ128rmb, TB_BCAST_SS}, {X86::VCVT2PS2PHXZ256rr, X86::VCVT2PS2PHXZ256rmb, TB_BCAST_SS}, {X86::VCVT2PS2PHXZrr, X86::VCVT2PS2PHXZrmb, TB_BCAST_SS}, @@ -7892,33 +7904,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_D}, - {X86::VCVTNE2PH2BF8SZ128rr, X86::VCVTNE2PH2BF8SZ128rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZ256rr, X86::VCVTNE2PH2BF8SZ256rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZrr, X86::VCVTNE2PH2BF8SZrmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z128rr, X86::VCVTNE2PH2BF8Z128rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z256rr, X86::VCVTNE2PH2BF8Z256rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Zrr, X86::VCVTNE2PH2BF8Zrmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ128rr, X86::VCVTNE2PH2HF8SZ128rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ256rr, X86::VCVTNE2PH2HF8SZ256rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZrr, X86::VCVTNE2PH2HF8SZrmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z128rr, X86::VCVTNE2PH2HF8Z128rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z256rr, 
X86::VCVTNE2PH2HF8Z256rmb, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Zrr, X86::VCVTNE2PH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrmb, TB_BCAST_SS}, - {X86::VCVTNEPH2BF8SZ128rrkz, X86::VCVTNEPH2BF8SZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZ256rrkz, X86::VCVTNEPH2BF8SZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZrrkz, X86::VCVTNEPH2BF8SZrmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z128rrkz, X86::VCVTNEPH2BF8Z128rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z256rrkz, X86::VCVTNEPH2BF8Z256rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Zrrkz, X86::VCVTNEPH2BF8Zrmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ128rrkz, X86::VCVTNEPH2HF8SZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ256rrkz, X86::VCVTNEPH2HF8SZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZrrkz, X86::VCVTNEPH2HF8SZrmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z128rrkz, X86::VCVTNEPH2HF8Z128rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z256rrkz, X86::VCVTNEPH2HF8Z256rmbkz, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Zrrkz, X86::VCVTNEPH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmbkz, TB_BCAST_SS}, @@ -7940,9 +7928,21 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTPD2UQQZ128rrkz, X86::VCVTPD2UQQZ128rmbkz, TB_BCAST_SD}, {X86::VCVTPD2UQQZ256rrkz, X86::VCVTPD2UQQZ256rmbkz, TB_BCAST_SD}, {X86::VCVTPD2UQQZrrkz, X86::VCVTPD2UQQZrmbkz, TB_BCAST_SD}, + {X86::VCVTPH2BF8SZ128rrkz, X86::VCVTPH2BF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZ256rrkz, X86::VCVTPH2BF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZrrkz, X86::VCVTPH2BF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z128rrkz, X86::VCVTPH2BF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z256rrkz, X86::VCVTPH2BF8Z256rmbkz, TB_BCAST_SH}, + 
{X86::VCVTPH2BF8Zrrkz, X86::VCVTPH2BF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTPH2DQZ128rrkz, X86::VCVTPH2DQZ128rmbkz, TB_BCAST_SH}, {X86::VCVTPH2DQZ256rrkz, X86::VCVTPH2DQZ256rmbkz, TB_BCAST_SH}, {X86::VCVTPH2DQZrrkz, X86::VCVTPH2DQZrmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ128rrkz, X86::VCVTPH2HF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ256rrkz, X86::VCVTPH2HF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZrrkz, X86::VCVTPH2HF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z128rrkz, X86::VCVTPH2HF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z256rrkz, X86::VCVTPH2HF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTPH2HF8Zrrkz, X86::VCVTPH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTPH2IBSZ128rrkz, X86::VCVTPH2IBSZ128rmbkz, TB_BCAST_SH}, {X86::VCVTPH2IBSZ256rrkz, X86::VCVTPH2IBSZ256rmbkz, TB_BCAST_SH}, {X86::VCVTPH2IBSZrrkz, X86::VCVTPH2IBSZrmbkz, TB_BCAST_SH}, @@ -8686,6 +8686,18 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmbik, TB_BCAST_SS}, {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmbik, TB_BCAST_SS}, {X86::VCMPPSZrrik, X86::VCMPPSZrmbik, TB_BCAST_SS}, + {X86::VCVT2PH2BF8SZ128rrkz, X86::VCVT2PH2BF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZ256rrkz, X86::VCVT2PH2BF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZrrkz, X86::VCVT2PH2BF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z128rrkz, X86::VCVT2PH2BF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z256rrkz, X86::VCVT2PH2BF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Zrrkz, X86::VCVT2PH2BF8Zrmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ128rrkz, X86::VCVT2PH2HF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ256rrkz, X86::VCVT2PH2HF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZrrkz, X86::VCVT2PH2HF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z128rrkz, X86::VCVT2PH2HF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z256rrkz, X86::VCVT2PH2HF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Zrrkz, X86::VCVT2PH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmbkz, TB_BCAST_SS}, 
{X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmbkz, TB_BCAST_SS}, {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmbkz, TB_BCAST_SS}, @@ -8716,33 +8728,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_D}, {X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmbk, TB_BCAST_D}, {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_D}, - {X86::VCVTNE2PH2BF8SZ128rrkz, X86::VCVTNE2PH2BF8SZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZ256rrkz, X86::VCVTNE2PH2BF8SZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZrrkz, X86::VCVTNE2PH2BF8SZrmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z128rrkz, X86::VCVTNE2PH2BF8Z128rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z256rrkz, X86::VCVTNE2PH2BF8Z256rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Zrrkz, X86::VCVTNE2PH2BF8Zrmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ128rrkz, X86::VCVTNE2PH2HF8SZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ256rrkz, X86::VCVTNE2PH2HF8SZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZrrkz, X86::VCVTNE2PH2HF8SZrmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z128rrkz, X86::VCVTNE2PH2HF8Z128rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z256rrkz, X86::VCVTNE2PH2HF8Z256rmbkz, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Zrrkz, X86::VCVTNE2PH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmbkz, TB_BCAST_SS}, - {X86::VCVTNEPH2BF8SZ128rrk, X86::VCVTNEPH2BF8SZ128rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZ256rrk, X86::VCVTNEPH2BF8SZ256rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8SZrrk, X86::VCVTNEPH2BF8SZrmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z128rrk, X86::VCVTNEPH2BF8Z128rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Z256rrk, X86::VCVTNEPH2BF8Z256rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2BF8Zrrk, X86::VCVTNEPH2BF8Zrmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ128rrk, X86::VCVTNEPH2HF8SZ128rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZ256rrk, 
X86::VCVTNEPH2HF8SZ256rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8SZrrk, X86::VCVTNEPH2HF8SZrmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z128rrk, X86::VCVTNEPH2HF8Z128rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Z256rrk, X86::VCVTNEPH2HF8Z256rmbk, TB_BCAST_SH}, - {X86::VCVTNEPH2HF8Zrrk, X86::VCVTNEPH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmbk, TB_BCAST_SS}, @@ -8764,9 +8752,21 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTPD2UQQZ128rrk, X86::VCVTPD2UQQZ128rmbk, TB_BCAST_SD}, {X86::VCVTPD2UQQZ256rrk, X86::VCVTPD2UQQZ256rmbk, TB_BCAST_SD}, {X86::VCVTPD2UQQZrrk, X86::VCVTPD2UQQZrmbk, TB_BCAST_SD}, + {X86::VCVTPH2BF8SZ128rrk, X86::VCVTPH2BF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZ256rrk, X86::VCVTPH2BF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTPH2BF8SZrrk, X86::VCVTPH2BF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z128rrk, X86::VCVTPH2BF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTPH2BF8Z256rrk, X86::VCVTPH2BF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTPH2BF8Zrrk, X86::VCVTPH2BF8Zrmbk, TB_BCAST_SH}, {X86::VCVTPH2DQZ128rrk, X86::VCVTPH2DQZ128rmbk, TB_BCAST_SH}, {X86::VCVTPH2DQZ256rrk, X86::VCVTPH2DQZ256rmbk, TB_BCAST_SH}, {X86::VCVTPH2DQZrrk, X86::VCVTPH2DQZrmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ128rrk, X86::VCVTPH2HF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZ256rrk, X86::VCVTPH2HF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8SZrrk, X86::VCVTPH2HF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z128rrk, X86::VCVTPH2HF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8Z256rrk, X86::VCVTPH2HF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTPH2HF8Zrrk, X86::VCVTPH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVTPH2IBSZ128rrk, X86::VCVTPH2IBSZ128rmbk, TB_BCAST_SH}, {X86::VCVTPH2IBSZ256rrk, X86::VCVTPH2IBSZ256rmbk, TB_BCAST_SH}, {X86::VCVTPH2IBSZrrk, X86::VCVTPH2IBSZrmbk, TB_BCAST_SH}, @@ -9786,6 +9786,18 @@ static const X86FoldTableEntry BroadcastTable4[] = { 
{X86::VANDPSZ128rrk, X86::VANDPSZ128rmbk, TB_BCAST_SS}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmbk, TB_BCAST_SS}, {X86::VANDPSZrrk, X86::VANDPSZrmbk, TB_BCAST_SS}, + {X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmbk, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z128rrk, X86::VCVT2PH2BF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Z256rrk, X86::VCVT2PH2BF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2BF8Zrrk, X86::VCVT2PH2BF8Zrmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ128rrk, X86::VCVT2PH2HF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZ256rrk, X86::VCVT2PH2HF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8SZrrk, X86::VCVT2PH2HF8SZrmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z128rrk, X86::VCVT2PH2HF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Z256rrk, X86::VCVT2PH2HF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVT2PH2HF8Zrrk, X86::VCVT2PH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVT2PS2PHXZ128rrk, X86::VCVT2PS2PHXZ128rmbk, TB_BCAST_SS}, {X86::VCVT2PS2PHXZ256rrk, X86::VCVT2PS2PHXZ256rmbk, TB_BCAST_SS}, {X86::VCVT2PS2PHXZrrk, X86::VCVT2PS2PHXZrmbk, TB_BCAST_SS}, @@ -9801,18 +9813,6 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VCVTBIASPH2HF8Z128rrk, X86::VCVTBIASPH2HF8Z128rmbk, TB_BCAST_SH}, {X86::VCVTBIASPH2HF8Z256rrk, X86::VCVTBIASPH2HF8Z256rmbk, TB_BCAST_SH}, {X86::VCVTBIASPH2HF8Zrrk, X86::VCVTBIASPH2HF8Zrmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZ128rrk, X86::VCVTNE2PH2BF8SZ128rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZ256rrk, X86::VCVTNE2PH2BF8SZ256rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8SZrrk, X86::VCVTNE2PH2BF8SZrmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z128rrk, X86::VCVTNE2PH2BF8Z128rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Z256rrk, X86::VCVTNE2PH2BF8Z256rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2BF8Zrrk, X86::VCVTNE2PH2BF8Zrmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ128rrk, X86::VCVTNE2PH2HF8SZ128rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZ256rrk, 
X86::VCVTNE2PH2HF8SZ256rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8SZrrk, X86::VCVTNE2PH2HF8SZrmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z128rrk, X86::VCVTNE2PH2HF8Z128rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Z256rrk, X86::VCVTNE2PH2HF8Z256rmbk, TB_BCAST_SH}, - {X86::VCVTNE2PH2HF8Zrrk, X86::VCVTNE2PH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmbk, TB_BCAST_SS}, From 1f0964f81e5ae90e1c50fcdd103ec9c838b995e0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 23 Jan 2025 14:51:18 +0000 Subject: [PATCH 148/208] [llvm][Docs] Clarify finding maintainers By noting where the files are to be found, and adding some whitespace to break up large blocks. (the merge on behalf bit needs a refresh but this will go into review later after this) --- llvm/docs/Contributing.rst | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/docs/Contributing.rst b/llvm/docs/Contributing.rst index cf48c66dc0d068..1200ee987e9d94 100644 --- a/llvm/docs/Contributing.rst +++ b/llvm/docs/Contributing.rst @@ -94,13 +94,17 @@ For more information about the workflow of using GitHub Pull Requests see our `LLVM's Phabricator `_ instance. To make sure the right people see your patch, please select suitable reviewers -and add them to your patch when requesting a review. Suitable reviewers are the -maintainers (see ``Maintainers.rst``) and other people doing work in the area your -patch touches. Github will normally suggest some reviewers based on rules or -people that have worked on the code before. If you are a new contributor, you -will not be able to select reviewers in such a way, in which case you can still -get the attention of potential reviewers by CC'ing them in a comment -- just -@name them. +and add them to your patch when requesting a review. 
+ +Suitable reviewers are the maintainers of the project you are modifying, and +anyone else working in the area your patch touches. To find maintainers, look for +the ``Maintainers.md`` or ``Maintainers.rst`` file in the root of the project's +sub-directory. For example, LLVM's is ``llvm/Maintainers.md`` and Clang's is +``clang/Maintainers.rst``. + +If you are a new contributor, you will not be able to select reviewers in such a +way, in which case you can still get the attention of potential reviewers by CC'ing +them in a comment -- just @name them. If you have received no comments on your patch for a week, you can request a review by 'ping'ing the GitHub PR with "Ping". The common courtesy 'ping' rate From 26b61e143b7e6117b57df2b58bbcb146a6f0f4d4 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Thu, 23 Jan 2025 14:57:31 +0000 Subject: [PATCH 149/208] [LoopVectorizer] Propagate underlying instruction to the cloned instances of VPPartialReductionRecipes (#123638) --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 +- .../LoopVectorize/AArch64/vplan-printing.ll | 68 ++++++++++++++++++- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 11ba7f06735134..8d3a2eaee2eff0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2467,7 +2467,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { ~VPPartialReductionRecipe() override = default; VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), + getUnderlyingInstr()); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index ccf8540b4ebf7c..32ecedc535b4db 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -70,7 +70,71 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block) ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; +; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' { +; CHECK-NEXT: Live-in ir<[[EP_VFxUF:.+]]> = VF * UF +; CHECK-NEXT: Live-in ir<[[EP_VEC_TC:.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: SCALAR-PHI vp<[[EP_IV:%.+]]> = phi ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[EP_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = 
vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<%accum> +; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from ir-bb) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[EP_RESUME:%.+]]> = resume-phi ir<1024>, ir<0> +; CHECK-NEXT: EMIT vp<[[EP_MERGE:%.+]]> = resume-phi vp<[[RED_RESULT]]>, ir<0> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<[[EP_MERGE]]> from ir-bb) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %add = add i32 %mul, %accum +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 +; CHECK-NEXT: No successors +; CHECK-NEXT: } entry: br 
label %for.body From 6206f5444fc0732e6495703c75a67f1f90f5b418 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:07:57 +0100 Subject: [PATCH 150/208] [AMDGPU] Occupancy w.r.t. workgroup size range is also a range (#123748) Occupancy (i.e., the number of waves per EU) depends, in addition to register usage, on per-workgroup LDS usage as well as on the range of possible workgroup sizes. Mirroring the latter, occupancy should therefore be expressed as a range since different group sizes generally yield different achievable occupancies. `getOccupancyWithLocalMemSize` currently returns a scalar occupancy based on the maximum workgroup size and LDS usage. With respect to the workgroup size range, this scalar can be the minimum, the maximum, or neither of the two of the range of achievable occupancies. This commit fixes the function by making it compute and return the range of achievable occupancies w.r.t. workgroup size and LDS usage; it also renames it to `getOccupancyWithWorkGroupSizes` since it is the range of workgroup sizes that produces the range of achievable occupancies. Computing the achievable occupancy range is surprisingly involved. Minimum/maximum workgroup sizes do not necessarily yield maximum/minimum occupancies i.e., sometimes workgroup sizes inside the range yield the occupancy bounds. The implementation finds these sizes in constant time; heavy documentation explains the rationale behind the sometimes relatively obscure calculations. As a justifying example, consider a target with 10 waves / EU, 4 EUs/CU, 64-wide waves. Also consider a function with no LDS usage and a flat workgroup size range of [513,1024]. - A group of 513 items requires 9 waves per group. Only 4 groups made up of 9 waves each can fit fully on a CU at any given time, for a total of 36 waves on the CU, or 9 per EU. 
However, filling as much as possible the remaining 40-36=4 wave slots without decreasing the number of groups reveals that a larger group of 640 items yields 40 waves on the CU, or 10 per EU. - Similarly, a group of 1024 items requires 16 waves per group. Only 2 groups made up of 16 waves each can fit fully on a CU at any given time, for a total of 32 waves on the CU, or 8 per EU. However, removing as many waves as possible from the groups without being able to fit another equal-sized group on the CU reveals that a smaller group of 896 items yields 28 waves on the CU, or 7 per EU. Therefore the achievable occupancy range for this function is not [8,9] as the group size bounds directly yield, but [7,10]. Naturally this change causes a lot of test churn as instruction scheduling is driven by achievable occupancy estimates. In most unit tests the flat workgroup size range is the default [1,1024] which, ignoring potential LDS limitations, would previously produce a scalar occupancy of 8 (derived from 1024) on a lot of targets, whereas we now consider the maximum occupancy to be 10 in such cases. Most tests are updated automatically and checked manually for sanity. I also manually changed some non-automatically generated assertions when necessary. 

Fixes #118220. 
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 123 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 18 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 9 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 20 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 16 +- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 5 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +- .../CodeGen/AMDGPU/GlobalISel/add.vni16.ll | 140 +- .../CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 336 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 434 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 406 +- .../AMDGPU/GlobalISel/insertelement.ll | 30 +- .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 275 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 384 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 192 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 994 +-- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 395 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 1235 ++-- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 116 +- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 1284 ++-- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 463 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1374 ++-- .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 44 +- llvm/test/CodeGen/AMDGPU/abs_i16.ll | 174 +- llvm/test/CodeGen/AMDGPU/add.ll | 64 +- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 460 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 13 +- .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 6164 ++++++++--------- .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 8 +- ...ffer-fat-pointers-contents-legalization.ll | 126 +- .../CodeGen/AMDGPU/calling-conventions.ll | 6 +- .../AMDGPU/dbg-value-ends-sched-region.mir | 32 +- .../AMDGPU/debug-value-scheduler-crash.mir | 38 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 212 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 
1995 +++--- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 53 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 649 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 560 +- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 168 +- llvm/test/CodeGen/AMDGPU/function-args.ll | 1732 ++--- llvm/test/CodeGen/AMDGPU/function-returns.ll | 28 +- .../AMDGPU/gfx-callable-argument-types.ll | 12 +- .../AMDGPU/gfx-callable-return-types.ll | 190 +- llvm/test/CodeGen/AMDGPU/half.ll | 509 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 6 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 966 +-- .../AMDGPU/insert_vector_elt.v2bf16.ll | 246 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 224 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 84 +- llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 16 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 324 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 198 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 150 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 198 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 97 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 1690 +++-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 2106 +++--- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 819 ++- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 95 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1472 ++-- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 2 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 2 +- .../machine-scheduler-sink-trivial-remats.mir | 160 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 4 +- llvm/test/CodeGen/AMDGPU/memory_clause.mir | 2 +- .../AMDGPU/min-waves-per-eu-not-respected.ll | 2 +- llvm/test/CodeGen/AMDGPU/mul.ll | 101 +- .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 16 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 148 +- llvm/test/CodeGen/AMDGPU/pr51516.mir | 4 +- .../AMDGPU/promote-constOffset-to-imm.ll | 251 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 94 +- .../CodeGen/AMDGPU/remat-fp64-constants.ll | 4 +- 
.../AMDGPU/resource-optimization-remarks.ll | 6 +- llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 218 +- ...dleMoveUp-subreg-def-across-subreg-def.mir | 4 +- .../AMDGPU/schedule-amdgpu-trackers.ll | 14 +- llvm/test/CodeGen/AMDGPU/schedule-barrier.mir | 18 +- .../schedule-regpressure-limit-clustering.ll | 2 +- .../AMDGPU/schedule-relaxed-occupancy.ll | 12 +- llvm/test/CodeGen/AMDGPU/sdiv.ll | 408 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 14 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 368 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 36 +- llvm/test/CodeGen/AMDGPU/shl.ll | 22 +- .../AMDGPU/shufflevector.v2i64.v4i64.ll | 14 +- .../AMDGPU/shufflevector.v2i64.v8i64.ll | 32 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll | 14 +- .../AMDGPU/shufflevector.v3i64.v4i64.ll | 138 +- .../CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll | 138 +- .../AMDGPU/shufflevector.v4i64.v4i64.ll | 291 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 291 +- llvm/test/CodeGen/AMDGPU/sra.ll | 44 +- llvm/test/CodeGen/AMDGPU/srem.ll | 232 +- llvm/test/CodeGen/AMDGPU/srl.ll | 22 +- llvm/test/CodeGen/AMDGPU/ssubsat.ll | 20 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 36 +- ...ine-function-info-long-branch-reg-debug.ll | 2 +- .../machine-function-info-long-branch-reg.ll | 2 +- .../AMDGPU/machine-function-info-no-ir.mir | 20 +- .../MIR/AMDGPU/machine-function-info.ll | 4 +- 105 files changed, 16507 insertions(+), 16905 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0c151d06924d8d..031d8f0560ff25 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -456,7 +456,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( - STM.computeOccupancy(F, MFI.getLDSSize()), + STM.getOccupancyWithWorkGroupSizes(*MF).second, 
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM, OutContext); @@ -1272,8 +1272,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( - STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, - ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); + STM.computeOccupancy(F, ProgInfo.LDSSize).second, + ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e27ef71c1c0883..907f82ed7fc528 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F); + ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ae563df2a7a128..d98a0ffcaf7e38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,55 +55,90 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -// FIXME: Should return min,max range. -// -// Returns the maximum occupancy, in number of waves per SIMD / EU, that can -// be achieved when only the given function is running on the machine; and -// taking into account the overall number of wave slots, the (maximum) workgroup -// size, and the per-workgroup LDS allocation size. 
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, - const Function &F) const { - const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; - const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize); - if (!MaxWorkGroupsPerCu) - return 0; - - const unsigned WaveSize = getWavefrontSize(); - - // FIXME: Do we need to account for alignment requirement of LDS rounding the - // size up? - // Compute restriction based on LDS usage - unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u); - - // This can be queried with more LDS than is possible, so just assume the - // worst. - if (NumGroups == 0) - return 1; - - NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); - - // Round to the number of waves per CU. - const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize); - unsigned MaxWaves = NumGroups * MaxGroupNumWaves; - - // Number of waves per EU (SIMD). - MaxWaves = divideCeil(MaxWaves, getEUsPerCU()); - - // Clamp to the maximum possible number of waves. - MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); +std::pair +AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, + const Function &F) const { + // FIXME: We should take into account the LDS allocation granularity. + const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); + + // Queried LDS size may be larger than available on a CU, in which case we + // consider the only achievable occupancy to be 1, in line with what we + // consider the occupancy to be when the number of requested registers in a + // particular bank is higher than the number of available ones in that bank. 
+ if (!MaxWGsLDS) + return {1, 1}; + + const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU(); + + auto PropsFromWGSize = [=](unsigned WGSize) + -> std::tuple<unsigned, unsigned, unsigned> { + unsigned WavesPerWG = divideCeil(WGSize, WaveSize); + unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS); + return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU}; + }; + + // The maximum group size will generally yield the minimum number of + // workgroups, maximum number of waves, and minimum occupancy. The opposite is + // generally true for the minimum group size. LDS or barrier resource + // limitations can flip those minimums/maximums. + const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F); + auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize); + auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize); + + // It is possible that we end up with flipped minimum and maximum number of + // waves per CU when the number of minimum/maximum concurrent groups on the CU + // is limited by LDS usage or barrier resources. + if (MinWavesPerCU >= MaxWavesPerCU) { + std::swap(MinWavesPerCU, MaxWavesPerCU); + } else { + const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU(); + + // Look for a potential smaller group size than the maximum which decreases + // the concurrent number of waves on the CU for the same number of + // concurrent workgroups on the CU. + unsigned MinWavesPerCUForWGSize = + divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU; + if (MinWavesPerCU > MinWavesPerCUForWGSize) { + unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize; + if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) { + // There may exist a smaller group size than the maximum that achieves + // the minimum number of waves per CU. This group size is the largest + // possible size that requires MaxWavesPerWG - E waves where E is + // maximized under the following constraints. + // 1.
0 <= E <= ExcessSlotsPerWG + // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize + MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG, + MaxWavesPerWG - MinWavesPerWG); + } + } - // FIXME: Needs to be a multiple of the group size? - //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves); + // Look for a potential larger group size than the minimum which increases + // the concurrent number of waves on the CU for the same number of + // concurrent workgroups on the CU. + unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG; + if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) { + // There may exist a larger group size than the minimum that achieves the + // maximum number of waves per CU. This group size is the smallest + // possible size that requires MinWavesPerWG + L waves where L is + // maximized under the following constraints. + // 1. 0 <= L <= LeftoverSlotsPerWG + // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize + MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG, + ((MaxWGSize - 1) / WaveSize) + 1 - + MinWavesPerWG); + } + } - assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() && - "computed invalid occupancy"); - return MaxWaves; + // Return the minimum/maximum number of waves on any EU, assuming that all + // wavefronts are spread across all EUs as evenly as possible. 
+ return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU), + std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)}; } -unsigned -AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { +std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( + const MachineFunction &MF) const { const auto *MFI = MF.getInfo(); - return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); + return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction()); } std::pair diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7701fef5365841..5944b69ce64162 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -127,11 +127,21 @@ class AMDGPUSubtarget { unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if - /// the given LDS memory size is the only constraint. - unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p F and each + /// workgroup running the function requires \p LDSBytes bytes of LDS space. + /// This notably depends on the range of allowed flat group sizes for the + /// function and hardware characteristics. + std::pair + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; - unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p MF. This notably + /// depends on the range of allowed flat group sizes for the function, the + /// amount of per-workgroup LDS space required by the function, and hardware + /// characteristics. 
+ std::pair + getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const; bool isAmdHsaOS() const { return TargetTriple.getOS() == Triple::AMDHSA; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index cb662258b26672..f89f531462c001 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1721,7 +1721,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (MFI->Occupancy == 0) { // Fixup the subtarget dependent default value. - MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); + MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second; } auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f5bbc5482d347c..b00105ae9bd528 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1089,9 +1089,8 @@ bool PreRARematStage::initGCNSchedStage() { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // Check maximum occupancy - if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == - DAG.MinOccupancy) + // Rematerialization will not help if occupancy is not limited by reg usage. 
+ if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy) return false; // FIXME: This pass will invalidate cached MBBLiveIns for regions @@ -1272,8 +1271,8 @@ void GCNSchedStage::checkScheduling() { return; } - unsigned TargetOccupancy = - std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF)); + unsigned TargetOccupancy = std::min( + S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); unsigned WavesAfter = std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); unsigned WavesBefore = diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 413c2884c034ea..b5e8e246825c7b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -400,16 +400,16 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { return getBaseReservedNumSGPRs(KernelUsesFlatScratch); } -unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, - unsigned NumSGPRs, - unsigned NumVGPRs) const { - unsigned Occupancy = - std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F)); - if (NumSGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); - if (NumVGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); - return Occupancy; +std::pair +GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, + unsigned NumSGPRs, unsigned NumVGPRs) const { + auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); + unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); + unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); + + // Maximum occupancy may be further limited by high SGPR/VGPR usage. 
+ MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc)); + return {std::min(MinOcc, MaxOcc), MaxOcc}; } unsigned GCNSubtarget::getBaseMaxNumSGPRs( diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e0b0000f757faf..342b211199dca3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1368,12 +1368,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// Return occupancy for the given function. Used LDS and a number of - /// registers if provided. - /// Note, occupancy can be affected by the scratch allocation as well, but + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p F, each workgroup + /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p + /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a + /// range, so this returns a range as well. + /// + /// Note that occupancy can be affected by the scratch allocation as well, but /// we do not have enough information to compute it. - unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, - unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + std::pair computeOccupancy(const Function &F, + unsigned LDSSize = 0, + unsigned NumSGPRs = 0, + unsigned NumVGPRs = 0) const; /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 7de64bddf78846..c5efb89d8b2dbc 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -48,7 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); assert(MaxNumWorkGroups.size() == 3); - Occupancy = ST.computeOccupancy(F, getLDSSize()); + Occupancy = ST.computeOccupancy(F, getLDSSize()).second; CallingConv::ID CC = F.getCallingConv(); VRegFlags.reserve(1024); @@ -185,8 +185,7 @@ MachineFunctionInfo *SIMachineFunctionInfo::clone( void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { limitOccupancy(getMaxWavesPerEU()); const GCNSubtarget& ST = MF.getSubtarget(); - limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(), - MF.getFunction())); + limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second); } Register SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 704435dad65d7b..11121e6058770f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3642,18 +3642,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - - unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - MF.getFunction()); + unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; switch (RC->getID()) { default: return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); case AMDGPU::VGPR_32RegClassID: - return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); + return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF)); case AMDGPU::SGPR_32RegClassID: case 
AMDGPU::SGPR_LO16RegClassID: - return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); + return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF)); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index ab95c226b08b02..27b93872b9f1df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -513,29 +513,29 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v16, v[0:1] +; GFX8-NEXT: flat_load_ushort v14, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v7, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v10, v9, v13 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v11, v16, v0 +; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 ; GFX8-NEXT: 
v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[14:15], v11 +; GFX8-NEXT: flat_store_short v[6:7], v13 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -661,55 +661,55 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-LABEL: add_v11i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 18, v0 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v18, v[10:11] -; GFX8-NEXT: flat_load_ushort v19, v[12:13] -; GFX8-NEXT: flat_load_ushort v20, v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[14:15] -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v4 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_ushort v14, v[14:15] 
+; GFX8-NEXT: flat_load_ushort v15, v[16:17] +; GFX8-NEXT: flat_load_ushort v16, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v3, v6, v10 +; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v21, v7, v11 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 +; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 ; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v22, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_ushort v3, v[6:7] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 +; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v13, v18, v0 +; 
GFX8-NEXT: v_add_u16_e32 v14, v2, v14 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v18, v19, v1 +; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v19, v20, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v21, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v22, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[14:15], v13 -; GFX8-NEXT: flat_store_short v[16:17], v18 -; GFX8-NEXT: flat_store_short v[6:7], v19 +; GFX8-NEXT: flat_store_short v[6:7], v14 +; GFX8-NEXT: flat_store_short v[8:9], v15 +; GFX8-NEXT: flat_store_short v[10:11], v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX8-NEXT: v_add_u16_e32 v8, v9, v13 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v6, v14, v16 -; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v8, v15, v17 -; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 ; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll index 0b66185d25f3e2..8db1f46b0342a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -712,33 +712,33 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] 
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -747,26 +747,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], 
v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -775,26 +775,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 
v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -945,33 +945,33 @@ 
define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 
s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -980,26 +980,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: 
v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1008,26 +1008,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: 
v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1106,7 +1106,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 
v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1115,23 +1115,23 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1266,7 +1266,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 
1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1275,23 +1275,23 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: 
v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,7 +1493,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1502,23 +1502,23 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1725,33 +1725,33 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: 
v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1760,26 +1760,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], 
v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1788,26 +1788,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: 
v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 
v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 5d76b542fad894..e60739fd84059b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7678,274 +7678,274 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v25, 63 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 -; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26 -; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX6-NEXT: v_not_b32_e32 v8, v16 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 -; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; 
GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 +; GFX6-NEXT: v_not_b32_e32 v16, v16 +; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 +; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 +; GFX6-NEXT: v_not_b32_e32 v25, 63 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v26, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 +; GFX6-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: 
v_sub_i32_e32 v10, vcc, 64, v14 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 +; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX6-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v25, 63 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 +; GFX8-NEXT: v_and_b32_e32 
v23, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX8-NEXT: v_not_b32_e32 v8, v16 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v16, v16 +; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 +; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v25, 63 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v19, 
v19, v17 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v26, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 -; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 +; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc 
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_not_b32_e32 v8, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 -; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v23 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 +; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; 
GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, 
vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc ; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 -; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] +; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8 +; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX9-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 -; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 -; 
GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index dbc8f12c2c25c4..36a6614a5620cd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -7719,86 +7719,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_not_b32_e32 v0, v16 ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 -; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 -; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19 -; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v26 +; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25 +; GFX6-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 +; GFX6-NEXT: v_not_b32_e32 v26, 63 +; GFX6-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX6-NEXT: v_and_b32_e32 v22, 
0x7f, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_add_i32_e32 v24, vcc, v22, v25 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX6-NEXT: v_not_b32_e32 v4, v20 -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; 
GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 -; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; 
GFX6-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[14:15], v18 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26 +; GFX6-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: @@ -7811,86 +7811,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_not_b32_e32 v0, v16 ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 -; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_add_u32_e32 
v26, vcc, v19, v25 -; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] -; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[17:18] +; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25 +; GFX8-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v26, 63 +; GFX8-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26 +; GFX8-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v22, v25 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] 
+; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX8-NEXT: v_not_b32_e32 v4, v20 -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26 ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX8-NEXT: 
v_lshrrev_b64 v[6:7], v11, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: @@ -7905,83 +7905,83 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 -; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] -; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX9-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25 +; GFX9-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; 
GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v22 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19 +; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v20 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; 
GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 -; GFX9-NEXT: v_or_b32_e32 v6, 
v4, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 +; GFX9-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index df1afdf77983cc..298dfcf048fc46 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -715,27 +715,27 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: 
v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[4:5] -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 3, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 4, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, 
s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index 75d4d8816fb30d..e8de761540b7a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -14,167 +14,168 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: v_mov_b32_e32 v4, s0 ; LOOP-NEXT: .LBB0_1: ; %load-store-loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 +; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 +; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc +; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: s_waitcnt expcnt(5) +; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1 ; LOOP-NEXT: s_waitcnt expcnt(2) -; LOOP-NEXT: v_add_i32_e32 v29, vcc, v2, v4 -; LOOP-NEXT: v_addc_u32_e32 v30, vcc, v3, v5, vcc -; LOOP-NEXT: buffer_load_ubyte v24, v[29:30], s[0:3], 0 addr64 -; LOOP-NEXT: buffer_load_ubyte v27, v[29:30], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_load_ubyte v34, v[29:30], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v35, v[29:30], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_load_ubyte v36, v[29:30], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_load_ubyte v37, v[29:30], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_load_ubyte v38, v[29:30], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_load_ubyte v39, v[29:30], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_load_ubyte v6, v[29:30], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_load_ubyte v9, v[29:30], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_load_ubyte v10, v[29:30], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], 
s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: buffer_load_ubyte v11, v[29:30], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_load_ubyte v7, v[29:30], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_load_ubyte v13, v[29:30], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_load_ubyte v14, v[29:30], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_load_ubyte v15, v[29:30], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_load_ubyte v8, v[29:30], s[0:3], 0 addr64 offset:16 -; LOOP-NEXT: buffer_load_ubyte v17, v[29:30], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_load_ubyte v18, v[29:30], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_load_ubyte v19, v[29:30], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_load_ubyte v12, v[29:30], s[0:3], 0 addr64 offset:20 -; LOOP-NEXT: buffer_load_ubyte v21, v[29:30], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_load_ubyte v22, v[29:30], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_load_ubyte v23, v[29:30], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_load_ubyte v16, v[29:30], s[0:3], 0 addr64 offset:24 -; LOOP-NEXT: buffer_load_ubyte v25, v[29:30], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_load_ubyte v26, v[29:30], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_load_ubyte v28, v[29:30], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_load_ubyte v20, v[29:30], s[0:3], 0 addr64 offset:28 -; LOOP-NEXT: buffer_load_ubyte v31, v[29:30], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_load_ubyte v32, v[29:30], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_load_ubyte v33, 
v[29:30], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_waitcnt vmcnt(14) -; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; LOOP-NEXT: v_or_b32_e32 v24, v27, v24 -; LOOP-NEXT: v_lshlrev_b32_e32 v27, 24, v35 -; LOOP-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; LOOP-NEXT: v_or_b32_e32 v27, v27, v29 -; LOOP-NEXT: v_lshlrev_b32_e32 v29, 8, v37 -; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v39 -; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; LOOP-NEXT: v_or_b32_e32 v29, v29, v36 -; LOOP-NEXT: v_or_b32_e32 
v30, v30, v34 -; LOOP-NEXT: v_add_i32_e32 v34, vcc, v0, v4 -; LOOP-NEXT: v_addc_u32_e32 v35, vcc, v1, v5, vcc +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; LOOP-NEXT: v_or_b32_e32 v26, v6, v26 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; LOOP-NEXT: v_or_b32_e32 v29, v6, v7 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v39 +; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; LOOP-NEXT: v_or_b32_e32 v31, v6, v36 +; LOOP-NEXT: v_or_b32_e32 v32, v7, v32 +; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4 +; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc ; LOOP-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v4 -; LOOP-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; LOOP-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; LOOP-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; LOOP-NEXT: s_waitcnt vmcnt(12) -; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; LOOP-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; LOOP-NEXT: s_waitcnt vmcnt(10) -; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; LOOP-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; LOOP-NEXT: s_waitcnt vmcnt(8) -; LOOP-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; LOOP-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; LOOP-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; LOOP-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; LOOP-NEXT: s_waitcnt vmcnt(6) -; LOOP-NEXT: v_lshlrev_b32_e32 v25, 8, 
v25 +; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; LOOP-NEXT: s_waitcnt vmcnt(4) -; LOOP-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; LOOP-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; LOOP-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; LOOP-NEXT: s_waitcnt vmcnt(2) -; LOOP-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; LOOP-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 -; LOOP-NEXT: v_or_b32_e32 v9, v11, v10 -; LOOP-NEXT: v_or_b32_e32 v7, v13, v7 -; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 -; LOOP-NEXT: v_or_b32_e32 v8, v17, v8 -; LOOP-NEXT: v_or_b32_e32 v11, v19, v18 -; LOOP-NEXT: v_or_b32_e32 v12, v21, v12 -; LOOP-NEXT: v_or_b32_e32 v13, v23, v22 -; LOOP-NEXT: v_or_b32_e32 v14, v25, v16 -; LOOP-NEXT: v_or_b32_e32 v15, v28, v26 -; LOOP-NEXT: v_or_b32_e32 v16, v31, v20 -; LOOP-NEXT: v_or_b32_e32 v17, v33, v32 -; LOOP-NEXT: v_or_b32_e32 v18, v27, v24 -; LOOP-NEXT: v_or_b32_e32 v19, v30, v29 -; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 -; LOOP-NEXT: v_or_b32_e32 v7, v10, v7 +; LOOP-NEXT: v_lshlrev_b32_e32 v35, 24, v35 +; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 -; LOOP-NEXT: v_or_b32_e32 v9, v13, v12 -; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 -; LOOP-NEXT: v_or_b32_e32 v11, v17, v16 -; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; LOOP-NEXT: v_bfe_u32 v13, v18, 8, 8 -; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 -; LOOP-NEXT: v_lshrrev_b32_e32 v14, 24, v18 -; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; LOOP-NEXT: v_bfe_u32 v16, v19, 8, 8 -; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: v_lshrrev_b32_e32 v17, 24, v19 +; LOOP-NEXT: v_or_b32_e32 v11, v13, v12 +; LOOP-NEXT: v_or_b32_e32 v9, v15, v9 +; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: v_or_b32_e32 v10, v19, v10 +; LOOP-NEXT: v_or_b32_e32 v13, v21, v20 +; 
LOOP-NEXT: v_or_b32_e32 v14, v23, v14 +; LOOP-NEXT: v_or_b32_e32 v15, v25, v24 +; LOOP-NEXT: v_or_b32_e32 v16, v27, v18 +; LOOP-NEXT: v_or_b32_e32 v17, v30, v28 +; LOOP-NEXT: v_or_b32_e32 v18, v33, v22 +; LOOP-NEXT: v_or_b32_e32 v19, v35, v34 +; LOOP-NEXT: v_or_b32_e32 v20, v29, v26 +; LOOP-NEXT: v_or_b32_e32 v21, v32, v31 +; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 +; LOOP-NEXT: v_or_b32_e32 v9, v12, v9 +; LOOP-NEXT: v_or_b32_e32 v10, v13, v10 +; LOOP-NEXT: v_or_b32_e32 v11, v15, v14 +; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: v_or_b32_e32 v13, v19, v18 +; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; LOOP-NEXT: v_bfe_u32 v15, v20, 8, 8 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: v_lshrrev_b32_e32 v16, 24, v20 +; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; LOOP-NEXT: v_bfe_u32 v18, v21, 8, 8 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: v_lshrrev_b32_e32 v19, 24, v21 ; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_bfe_u32 v19, v6, 8, 8 -; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; LOOP-NEXT: v_bfe_u32 v21, v7, 8, 8 -; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; LOOP-NEXT: v_bfe_u32 v23, v8, 8, 8 -; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: v_bfe_u32 v21, v8, 8, 8 +; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:8 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; LOOP-NEXT: v_bfe_u32 v25, v9, 8, 8 -; LOOP-NEXT: buffer_store_byte v9, v[34:35], 
s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; LOOP-NEXT: v_bfe_u32 v23, v9, 8, 8 +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:12 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; LOOP-NEXT: v_bfe_u32 v27, v10, 8, 8 -; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; LOOP-NEXT: v_bfe_u32 v25, v10, 8, 8 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:16 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; LOOP-NEXT: v_bfe_u32 v29, v11, 8, 8 -; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; LOOP-NEXT: v_bfe_u32 v27, v11, 8, 8 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:20 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: buffer_store_byte v13, v[34:35], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_store_byte v12, v[34:35], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_store_byte v14, v[34:35], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_store_byte v16, v[34:35], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_store_byte v15, v[34:35], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_store_byte v17, v[34:35], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_store_byte v21, v[34:35], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_store_byte v20, v[34:35], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_store_byte v23, v[34:35], s[0:3], 0 addr64 offset:17 -; 
LOOP-NEXT: buffer_store_byte v22, v[34:35], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_store_byte v25, v[34:35], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_store_byte v24, v[34:35], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_store_byte v27, v[34:35], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_store_byte v26, v[34:35], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_store_byte v29, v[34:35], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_store_byte v28, v[34:35], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; LOOP-NEXT: v_bfe_u32 v29, v12, 8, 8 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; LOOP-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; LOOP-NEXT: v_bfe_u32 v31, v13, 8, 8 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v13, 24, v13 +; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[0:3], 0 addr64 offset:13 +; 
LOOP-NEXT: buffer_store_byte v22, v[6:7], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_store_byte v25, v[6:7], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v27, v[6:7], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_store_byte v26, v[6:7], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_store_byte v29, v[6:7], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_store_byte v28, v[6:7], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_store_byte v31, v[6:7], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 ; LOOP-NEXT: ; %bb.2: ; %memcpy-split ; LOOP-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 756eb2788607bf..7c6daf769aec28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2074,208 +2074,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v16, v0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX7-NEXT: v_mov_b32_e32 v17, v1 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, 
v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v18, v23 -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-NEXT: v_mov_b32_e32 v1, v23 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v22 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; 
GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 
v[16:17], s[4:5], v6, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX7-NEXT: v_mov_b32_e32 v20, v18 +; GFX7-NEXT: v_mov_b32_e32 v18, v19 +; GFX7-NEXT: v_mov_b32_e32 v19, v16 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v19, v22 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v20, v11 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; 
GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v18, v23 -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, 
v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, v23 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v22 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; 
GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX8-NEXT: v_mov_b32_e32 v20, v18 +; GFX8-NEXT: v_mov_b32_e32 v18, v19 +; GFX8-NEXT: v_mov_b32_e32 v19, v16 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v19, v22 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], 
s[8:9], v3, v10, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v20, v11 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v20 -; GFX9-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX9-NEXT: 
v_mul_lo_u32 v13, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, 
v9, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc +; GFX9-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v16 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 
0, v2, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index e289ee759da158..4bfd29430ff1ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1962,8 +1962,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 -; GFX6-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 @@ -1987,70 +1988,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX6-NEXT: v_sub_i32_e32 
v18, vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v16, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v9 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v8 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v8 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v10 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v9 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v9 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v11 +; 
GFX6-NEXT: v_min_i32_e32 v17, 0, v10 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v10 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v12 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v11 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v11 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v13 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v12 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v12 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v14 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v13 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v13 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; 
GFX6-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v14 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v15 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2083,8 +2083,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 -; GFX8-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 @@ -2108,70 +2109,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v7 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v7 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: 
v_sub_u32_e32 v19, vcc, v16, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v8 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v9 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v8 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v8 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v10 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v9 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v9 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v11 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v10 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v10 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, 
v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v27 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v12 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v11 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v11 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v13 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v12 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v12 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v14 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v13 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v13 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v14 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 +; 
GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v15 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 80cda2e7f3c816..0c9ff3eee8231c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -429,190 +429,193 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v16, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 
v10, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; 
GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0 -; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_addc_u32_e32 v0, 
vcc, v13, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1] -; GISEL-NEXT: 
v_xor_b32_e32 v8, v12, v13 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -622,23 +625,23 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: 
v_xor_b32_e32 v4, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1189,123 +1192,123 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, 
v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: 
v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: 
v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: 
v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v15, vcc +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1313,34 +1316,34 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1348,19 +1351,19 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1421,178 +1424,178 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 
v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v18, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; 
CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: 
v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1851,6 +1854,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 @@ -1861,22 +1865,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v7, v12 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v11 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v18, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, 
v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -1891,24 +1895,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v15, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 ; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 ; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1917,164 +1921,166 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 
v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v15, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v14, v[11:12] -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v16, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 
vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12] -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; 
GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v21, v11, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 
v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: 
v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] @@ -2085,7 +2091,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 @@ -2099,8 +2105,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 5f568839a28dd3..40f29c56c8f127 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1537,36 +1537,36 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_add_u32_e64 
v13, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s2, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 
s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 ; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 @@ -1635,7 +1635,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] @@ -1680,206 +1679,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1] +; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, 
v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s11, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] +; GFX9-NEXT: 
v_add_co_u32_e64 v13, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v17, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v14, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v17 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc +; GFX9-NEXT: 
v_cvt_u32_f32_e32 v13, v16 ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s20, v18, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX9-NEXT: v_mul_hi_u32 v11, v18, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v14, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v18, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v11, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v11, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: 
v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1 -; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v11, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_xor_b32_e32 v8, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v9, s17, v9 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v8 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v9, v10, vcc -; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, 
v0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; 
GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 ; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc -; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc -; 
GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s2, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s2, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 +; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc +; 
GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 -; GFX9-NEXT: v_mov_b32_e32 v9, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s1, 
v8 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8 -; GFX9-NEXT: v_mov_b32_e32 v9, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15] +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 1f4448d9a632a0..df645888626c6d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -419,24 +419,24 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 ; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 ; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -445,190 +445,191 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v5, v12, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] ; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v16, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 +; GISEL-NEXT: 
v_mul_lo_u32 v14, v15, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; GISEL-NEXT: 
v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; 
GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; 
GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: 
v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -1117,93 +1118,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: 
v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: 
v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; 
GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 
+; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc @@ -1216,22 +1220,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: 
v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1239,34 +1240,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, 
v0 @@ -1274,19 +1275,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1345,96 +1346,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, 
v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: 
v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; 
CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: 
v_sub_i32_e32 v16, vcc, v14, v4 @@ -1443,78 +1444,78 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, 
v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1710,93 +1711,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; 
GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; 
GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; GISEL-NEXT: 
v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc @@ -1809,22 +1813,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1832,34 +1833,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; 
GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1867,19 +1868,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, 
vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1938,96 +1939,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: 
v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 @@ -2036,78 +2037,78 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v17 ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 
v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: 
v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -2350,7 +2351,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 ; GISEL-NEXT: v_mov_b32_e32 v9, 0 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 -; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc @@ -2425,172 +2425,175 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; 
GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v13, v[10:11] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v0 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v15, v10, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v0, v7, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v9, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v11, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v10, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v9, v1, s[4:5] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; 
GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v9 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v14, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v20, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 +; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v13, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mov_b32_e32 v1, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: 
v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v14, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GISEL-NEXT: 
v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v14, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 
v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 ; 
GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v13, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2605,13 +2608,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: 
v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 43ebe156eb2a28..5673a6c6e869d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1965,8 +1965,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 @@ -1990,70 +1991,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v7 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v8 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; 
GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v9 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v10 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v11 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: 
v_min_i32_e32 v18, -1, v13 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v14 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2086,8 +2086,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 @@ -2111,70 +2112,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: 
v_sub_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v7 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v8 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v9 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v10 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v11 ; GFX8-NEXT: v_max_i32_e32 v17, v17, 
v27 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v13 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v14 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v18, v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 1ee521b3dedac1..f5a901b024ef52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,256 +365,256 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 
v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: 
v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 
v16, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 
0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, 
v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; 
GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; 
GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 
v2, s[20:21], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4 -; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16 -; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: 
v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 
v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] -; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; 
GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1252,256 +1252,256 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v10, 0 ; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 
v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, 
v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 
-; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: 
v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; 
GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 
v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; 
GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v19, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[14:15], v17, v8 -; GISEL-NEXT: v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11] 
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 -; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: 
v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v6 +; GISEL-NEXT: 
v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; GISEL-NEXT: 
v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: @@ -1904,16 +1904,14 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 @@ -1929,76 +1927,78 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: 
v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: 
v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 
v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 @@ -2007,140 +2007,140 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: 
v_mul_lo_u32 v9, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17 -; GISEL-NEXT: v_add_i32_e32 
v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; 
GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: 
v_mul_lo_u32 v17, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v18 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v13, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v13, s[6:7], 0, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v14, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v15, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v13, vcc, 0, v13, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 -; 
GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v17, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v21, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, 
s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index ffebde52df4a3e..e3c1a52696b47c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1087,95 +1087,95 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 ; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX8-NEXT: v_trunc_f32_e32 v14, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14 +; GFX8-NEXT: v_trunc_f32_e32 v3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, 
v7, s[0:1] +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3] -; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 -; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3] -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1 -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1 +; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2 +; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2 -; GFX8-NEXT: v_mul_hi_u32 v1, 
v14, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 +; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 +; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5] ; GFX8-NEXT: 
v_cndmask_b32_e64 v1, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc ; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 ; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] @@ -1216,27 +1216,27 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 ; GFX8-NEXT: v_subbrev_u32_e64 
v12, s[0:1], 0, v3, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7 +; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; GFX8-NEXT: v_mov_b32_e32 v9, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] @@ -1330,182 +1330,181 @@ define amdgpu_kernel void 
@udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v9, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v8, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v1 -; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v1 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s17, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s4, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX9-NEXT: v_trunc_f32_e32 v15, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 ; 
GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 -; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 +; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s4, v11 -; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 +; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; 
GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3 -; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v20, vcc -; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v11 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 +; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 +; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 
s[0:1], 0, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s19, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, s18, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s19, v6 -; GFX9-NEXT: v_add_u32_e32 
v2, v9, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s6, v12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 -; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v9, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v12, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s18, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s19, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s6, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s6, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, 
v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[14:15] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, v10, v8 +; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12 +; GFX9-NEXT: v_mov_b32_e32 
v0, v5 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1] +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10 +; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1] 
+; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index faad7e93da5d37..2be4b52198b455 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,254 +359,254 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 
v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, 
v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v13, 
vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: 
v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 
0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 -; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 
v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; 
GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] -; GISEL-NEXT: v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v19, v7 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v19, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64: @@ -1103,20 +1103,20 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s4, 0, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 ; GISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GISEL-NEXT: s_subb_u32 s5, 0, 0 +; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_trunc_f32_e32 v7, 
v7 ; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v7, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s4, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6 ; GISEL-NEXT: v_mul_hi_u32 v11, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, s5, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 @@ -1134,41 +1134,41 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v17, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, 
v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v18 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v7, v10, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, s4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, s6, v11 ; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s5, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, s7, v6 ; GISEL-NEXT: v_mul_hi_u32 v15, v6, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, v10, v5 ; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 @@ -1176,9 +1176,9 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5 ; GISEL-NEXT: v_mul_lo_u32 v19, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 @@ -1186,38 +1186,38 @@ define <2 x i64> 
@v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v21, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: 
v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v11 @@ -1675,254 +1675,254 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v10, 0 ; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: 
v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], 
v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: 
v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, 
v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc 
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; 
GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, 
v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4 -; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4 -; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 -; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, 
v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: 
v_urem_v2i64_pow2_shl_denom: @@ -2319,16 +2319,14 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 @@ -2344,76 +2342,78 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: 
v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: 
v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 @@ -2422,136 +2422,136 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; 
GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: 
v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v3, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc -; 
GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0042d34e235d17..4faa7edadf07a5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1346,29 +1346,29 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 
{ ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_min_u32_e32 v17, v11, v27 -; GFX6-NEXT: v_min_u32_e32 v18, v12, v28 -; GFX6-NEXT: v_min_u32_e32 v19, v13, v29 -; GFX6-NEXT: v_min_u32_e32 v20, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20 +; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 +; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 +; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 +; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 +; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 +; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 +; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 +; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 +; GFX6-NEXT: v_sub_i32_e32 v12, 
vcc, v12, v25 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll index daed0986fa9c88..0ae2b4f549919d 100644 --- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll @@ -823,32 +823,32 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v14, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v0, v0, v19 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_sub_u16_e32 v8, 0, v1 +; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7 ; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6 ; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 ; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v22, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v23, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_sub_u16_e32 v15, 0, v2 ; GFX8-NEXT: v_max_i16_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; 
GFX8-NEXT: v_max_i16_sdwa v12, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v13, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v14, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v23 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v22 -; GFX8-NEXT: v_max_i16_e32 v2, v2, v21 -; GFX8-NEXT: v_max_i16_e32 v3, v3, v20 +; GFX8-NEXT: v_max_i16_e32 v2, v2, v15 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v8 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 ; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v17 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v12 @@ -1255,85 +1255,85 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v0 -; GFX8-NEXT: v_max_i16_sdwa v19, v0, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v1 -; GFX8-NEXT: v_max_i16_sdwa v20, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v1, v1, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v2 -; GFX8-NEXT: v_max_i16_sdwa v19, v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v2, v2, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v3 -; GFX8-NEXT: v_max_i16_sdwa v20, v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v3, v3, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v4 -; GFX8-NEXT: v_max_i16_sdwa v19, v4, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v4, v4, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v5 -; GFX8-NEXT: v_max_i16_sdwa v20, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v5, v5, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v6 -; GFX8-NEXT: v_max_i16_sdwa v19, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v6, v6, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v7 -; GFX8-NEXT: v_max_i16_sdwa v20, v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v7, v7, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, 
v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v8 -; GFX8-NEXT: v_max_i16_sdwa v19, v8, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v8, v8, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v8, v8, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v9 -; GFX8-NEXT: v_max_i16_sdwa v20, v9, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v9, v9, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v10 -; GFX8-NEXT: v_max_i16_sdwa v19, v10, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v10, v10, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v10, v10, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v11 -; GFX8-NEXT: v_max_i16_sdwa v20, v11, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v11, v11, v19 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v18, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v0, v0, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v1 +; GFX8-NEXT: v_max_i16_sdwa v19, v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v1, v1, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v2 +; GFX8-NEXT: v_max_i16_sdwa v18, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v2, v2, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v3 +; GFX8-NEXT: v_max_i16_sdwa v19, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v3, v3, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 +; GFX8-NEXT: v_max_i16_sdwa v18, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 +; GFX8-NEXT: v_max_i16_sdwa v19, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v6 +; GFX8-NEXT: v_max_i16_sdwa v18, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v6, v6, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v7 +; GFX8-NEXT: v_max_i16_sdwa v19, v7, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v7, v7, v18 +; GFX8-NEXT: 
v_sub_u16_sdwa v18, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v8 +; GFX8-NEXT: v_max_i16_sdwa v18, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v8, v8, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v9 +; GFX8-NEXT: v_max_i16_sdwa v19, v9, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v9, v9, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v10 +; GFX8-NEXT: v_max_i16_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v10, v10, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v11 +; GFX8-NEXT: v_max_i16_sdwa v19, v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v11, v11, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v12 +; GFX8-NEXT: v_max_i16_sdwa v18, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v12, v12, v19 ; GFX8-NEXT: v_sub_u16_sdwa v17, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v11, v11, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12 -; GFX8-NEXT: v_max_i16_sdwa v16, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v12, v12, v20 -; GFX8-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX8-NEXT: v_sub_u16_e32 v16, 0, v13 -; GFX8-NEXT: v_max_i16_sdwa v19, v13, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v15 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v16 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v12, v12, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v13 +; GFX8-NEXT: v_max_i16_sdwa v16, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v13, v13, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v15 +; GFX8-NEXT: v_or_b32_e32 v13, v13, v16 ; GFX8-NEXT: v_sub_u16_e32 v16, 0, v14 ; GFX8-NEXT: v_max_i16_sdwa v17, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v18, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v19, v14, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_e32 v14, v14, v16 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v20 -; GFX8-NEXT: v_or_b32_e32 v13, v13, v19 -; GFX8-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v18 +; GFX8-NEXT: v_or_b32_e32 v14, v14, v19 ; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 033af692438015..cd5b585a8c4e23 
100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -474,44 +474,44 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s4, s11, s39 -; GFX6-NEXT: s_add_i32 s5, s10, s38 -; GFX6-NEXT: s_add_i32 s6, s9, s37 -; GFX6-NEXT: s_add_i32 s7, s8, s36 -; GFX6-NEXT: s_add_i32 s8, s15, s43 -; GFX6-NEXT: s_add_i32 s9, s14, s42 -; GFX6-NEXT: s_add_i32 s10, s13, s41 -; GFX6-NEXT: s_add_i32 s11, s12, s40 -; GFX6-NEXT: s_add_i32 s12, s19, s47 -; GFX6-NEXT: s_add_i32 s13, s18, s46 -; GFX6-NEXT: s_add_i32 s14, s17, s45 -; GFX6-NEXT: s_add_i32 s15, s16, s44 -; GFX6-NEXT: s_add_i32 s16, s23, s51 -; GFX6-NEXT: s_add_i32 s17, s22, s50 -; GFX6-NEXT: s_add_i32 s18, s21, s49 -; GFX6-NEXT: s_add_i32 s19, s20, s48 -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: s_add_i32 s6, s11, s39 +; GFX6-NEXT: s_add_i32 s7, s10, s38 +; GFX6-NEXT: s_add_i32 s10, s15, s43 +; GFX6-NEXT: s_add_i32 s11, s14, s42 +; GFX6-NEXT: s_add_i32 s14, s19, s47 +; GFX6-NEXT: s_add_i32 s15, s18, s46 +; GFX6-NEXT: s_add_i32 s18, s23, s51 +; GFX6-NEXT: s_add_i32 s19, s22, s50 +; GFX6-NEXT: s_add_i32 s21, s21, s49 +; GFX6-NEXT: s_add_i32 s20, s20, s48 +; GFX6-NEXT: s_add_i32 s17, s17, s45 +; GFX6-NEXT: s_add_i32 s16, s16, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 +; GFX6-NEXT: s_add_i32 s13, s13, s41 +; GFX6-NEXT: s_add_i32 s12, s12, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NEXT: s_add_i32 s9, s9, s37 +; GFX6-NEXT: s_add_i32 s8, s8, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 236956c1829e77..f176f34f847366 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -485,13 +485,10 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc @@ -500,13 +497,10 @@ define 
<16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc @@ -514,6 +508,12 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] ; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] +; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27] +; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29] +; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31] ; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc @@ -733,65 +733,64 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) { ; CI-NEXT: s_load_dword s4, s[6:7], 0x11 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc 
-; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 
v25, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; CI-NEXT: v_mov_b32_e32 v1, v48 -; CI-NEXT: v_mov_b32_e32 v2, v35 -; CI-NEXT: v_mov_b32_e32 v3, v33 -; CI-NEXT: v_mov_b32_e32 v4, v36 -; CI-NEXT: v_mov_b32_e32 v5, v49 -; CI-NEXT: v_mov_b32_e32 v6, v37 -; CI-NEXT: v_mov_b32_e32 v7, v34 -; CI-NEXT: v_mov_b32_e32 v8, v38 -; CI-NEXT: v_mov_b32_e32 v10, v50 -; CI-NEXT: v_mov_b32_e32 v12, v39 -; CI-NEXT: v_mov_b32_e32 v14, v32 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; CI-NEXT: 
v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; CI-NEXT: v_mov_b32_e32 v1, v49 +; CI-NEXT: v_mov_b32_e32 v2, v34 +; CI-NEXT: v_mov_b32_e32 v3, v39 +; CI-NEXT: v_mov_b32_e32 v4, v35 +; CI-NEXT: v_mov_b32_e32 v5, v32 +; CI-NEXT: v_mov_b32_e32 v6, v36 +; CI-NEXT: v_mov_b32_e32 v8, v48 +; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_mov_b32_e32 v12, v33 +; CI-NEXT: v_mov_b32_e32 v14, v38 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p5_to_v16p0: @@ -801,63 +800,62 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; 
GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; GFX9-NEXT: v_mov_b32_e32 v1, v48 -; GFX9-NEXT: v_mov_b32_e32 v2, v35 -; GFX9-NEXT: v_mov_b32_e32 v3, v33 -; GFX9-NEXT: v_mov_b32_e32 v4, v36 -; GFX9-NEXT: v_mov_b32_e32 v5, v49 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-NEXT: v_mov_b32_e32 v8, v38 -; GFX9-NEXT: v_mov_b32_e32 v10, v50 -; GFX9-NEXT: v_mov_b32_e32 v12, v39 -; GFX9-NEXT: v_mov_b32_e32 v14, v32 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; GFX9-NEXT: v_mov_b32_e32 v1, v49 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: 
v_mov_b32_e32 v3, v39 +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-NEXT: v_mov_b32_e32 v6, v36 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; GFX9-NEXT: v_mov_b32_e32 v12, v33 +; GFX9-NEXT: v_mov_b32_e32 v14, v38 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(5)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -939,13 +937,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc @@ -954,13 +949,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc @@ -968,6 +960,12 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc ; HSA-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[22:23] ; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] +; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27] +; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29] +; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31] ; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc @@ -1187,65 +1185,64 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) { ; CI-NEXT: s_load_dword s4, s[6:7], 0x10 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 
s[12:13], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; CI-NEXT: v_mov_b32_e32 v1, v48 -; CI-NEXT: v_mov_b32_e32 v2, v35 -; CI-NEXT: v_mov_b32_e32 v3, v33 -; CI-NEXT: v_mov_b32_e32 v4, v36 -; CI-NEXT: v_mov_b32_e32 v5, v49 -; CI-NEXT: v_mov_b32_e32 v6, v37 -; CI-NEXT: v_mov_b32_e32 v7, v34 -; CI-NEXT: v_mov_b32_e32 v8, v38 -; CI-NEXT: v_mov_b32_e32 v10, v50 -; CI-NEXT: v_mov_b32_e32 v12, v39 -; CI-NEXT: v_mov_b32_e32 v14, v32 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; CI-NEXT: 
v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; CI-NEXT: v_mov_b32_e32 v1, v49 +; CI-NEXT: v_mov_b32_e32 v2, v34 +; CI-NEXT: v_mov_b32_e32 v3, v39 +; CI-NEXT: v_mov_b32_e32 v4, v35 +; CI-NEXT: v_mov_b32_e32 v5, v32 +; CI-NEXT: v_mov_b32_e32 v6, v36 +; CI-NEXT: v_mov_b32_e32 v8, v48 +; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_mov_b32_e32 v12, v33 +; CI-NEXT: v_mov_b32_e32 v14, v38 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p3_to_v16p0: @@ -1255,63 +1252,62 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr 
addrspace(3)> %ptr) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, 
s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; GFX9-NEXT: v_mov_b32_e32 v1, v48 -; GFX9-NEXT: v_mov_b32_e32 v2, v35 -; GFX9-NEXT: v_mov_b32_e32 v3, v33 -; GFX9-NEXT: v_mov_b32_e32 v4, v36 -; GFX9-NEXT: v_mov_b32_e32 v5, v49 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-NEXT: v_mov_b32_e32 v8, v38 -; GFX9-NEXT: v_mov_b32_e32 v10, v50 -; GFX9-NEXT: v_mov_b32_e32 v12, v39 -; GFX9-NEXT: v_mov_b32_e32 v14, v32 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; GFX9-NEXT: 
v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; GFX9-NEXT: v_mov_b32_e32 v1, v49 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v39 +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-NEXT: v_mov_b32_e32 v6, v36 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; GFX9-NEXT: v_mov_b32_e32 v12, v33 +; GFX9-NEXT: v_mov_b32_e32 v14, v38 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(3)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -1550,13 +1546,9 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; HSA-LABEL: addrspacecast_v16p6_to_v16p0: ; HSA: ; %bb.0: ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HSA-NEXT: v_mov_b32_e32 v30, v15 ; HSA-NEXT: v_mov_b32_e32 v28, v14 -; HSA-NEXT: v_mov_b32_e32 v26, v13 ; HSA-NEXT: v_mov_b32_e32 v24, v12 -; HSA-NEXT: v_mov_b32_e32 v22, v11 ; HSA-NEXT: v_mov_b32_e32 v20, v10 -; HSA-NEXT: v_mov_b32_e32 v18, v9 ; 
HSA-NEXT: v_mov_b32_e32 v16, v8 ; HSA-NEXT: v_mov_b32_e32 v14, v7 ; HSA-NEXT: v_mov_b32_e32 v12, v6 @@ -1569,6 +1561,10 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; HSA-NEXT: v_mov_b32_e32 v3, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0 ; HSA-NEXT: v_mov_b32_e32 v7, 0 +; HSA-NEXT: v_mov_b32_e32 v18, v9 +; HSA-NEXT: v_mov_b32_e32 v22, v11 +; HSA-NEXT: v_mov_b32_e32 v26, v13 +; HSA-NEXT: v_mov_b32_e32 v30, v15 ; HSA-NEXT: v_mov_b32_e32 v9, 0 ; HSA-NEXT: v_mov_b32_e32 v11, 0 ; HSA-NEXT: v_mov_b32_e32 v13, 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 823db84a053b8c..4ce46bbaf45ac1 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -104,13 +104,12 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_write_b32 a16, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse @@ -369,7 +368,7 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; FIXME: This case is broken. The asm value passed in v32 is live ; through the range where the reserved def for the copy is introduced, ; clobbering the user value. 
-define void @v32_asm_def_use(float %v0, float %v1) #0 { +define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX908-LABEL: v32_asm_def_use: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1002,13 +1001,12 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a32, v39 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_read_b32 v33, a1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_write_b32 a32, v33 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse @@ -1147,3 +1145,4 @@ attributes #0 = { "amdgpu-waves-per-eu"="6,6" } attributes #1 = { convergent nounwind readnone willreturn } attributes #2 = { nounwind readnone willreturn } attributes #3 = { "amdgpu-waves-per-eu"="7,7" } +attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index a6d8c6f41eee59..3e19ee5567929c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 77 -; TRAP-HANDLER-DISABLE: NumSgprs: 92 +; TRAP-HANDLER-ENABLE: NumSgprs: 61 +; TRAP-HANDLER-DISABLE: NumSgprs: 77 define amdgpu_kernel void 
@amdhsa_trap_num_sgprs( ptr addrspace(1) %out0, i32 %in0, ptr addrspace(1) %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 8e3c905b0eae5b..ec469b3020ccee 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -662,14 +662,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -677,9 +675,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen @@ -687,60 +685,63 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 
v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 ; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 ; 
GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: 
v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -758,14 +759,6 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0 -; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0 -; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -809,26 +802,34 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: buffer_store_dword v6, v1, 
s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: 
buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1335,83 +1336,83 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 
v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 +; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], 
v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16 -; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1421,78 +1422,78 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v24, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 ; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: 
v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 ; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 
v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 @@ -1564,207 +1565,203 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 +; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 +; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 +; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, 
v8 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 +; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 +; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 +; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword 
v18, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16 -; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 +; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16 -; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16 -; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 ; 
GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 
v22, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1780,24 +1777,27 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 @@ -1832,16 +1832,97 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 +; GFX7-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, 
v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1852,124 +1933,39 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 ; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 ; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 ; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39 -; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 -; GFX7-NEXT: 
buffer_load_dword v23, off, s[0:3], s32 offset:12 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16 -; GFX7-NEXT: 
s_waitcnt vmcnt(13) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34 -; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16 -; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: buffer_store_dwordx4 
v[18:21], v[31:32], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 @@ -4880,12 +4876,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v21, s30, 0 -; GCN-NEXT: v_writelane_b32 v21, s31, 1 +; GCN-NEXT: v_writelane_b32 v20, s30, 0 +; GCN-NEXT: v_writelane_b32 v20, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4911,36 +4907,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, 
v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4951,30 +4947,30 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 
offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v21, 1 -; GCN-NEXT: v_readlane_b32 s30, v21, 0 +; GCN-NEXT: v_readlane_b32 s31, v20, 1 +; GCN-NEXT: v_readlane_b32 s30, v20, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 @@ -5365,10 +5361,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, 
v1 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) @@ -5587,20 +5583,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 @@ -5617,11 +5613,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 
offen offset:8 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7618,197 +7614,197 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 -; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 -; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52 -; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54 -; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56 -; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58 -; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60 -; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62 +; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 +; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 +; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 
offset:54 +; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 +; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 +; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60 +; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 ; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 ; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 -; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40 -; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42 +; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 +; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 ; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 ; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0 +; GCN-NEXT: buffer_store_dword v2, 
v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, 
v33 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; 
GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GCN-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 +; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0 ; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 ; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 ; GCN-NEXT: s_waitcnt expcnt(0) 
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 ; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 ; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -7824,34 +7820,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen ; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_cvt_f64_f32_e32 
v[1:2], v9 ; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36 -; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13 +; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -7864,258 +7860,258 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62 -; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60 -; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58 -; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56 -; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54 -; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52 -; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50 -; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34 -; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36 -; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38 -; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 -; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 -; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 -; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 -; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2 -; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6 -; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10 -; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12 +; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62 +; GFX7-NEXT: 
buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60 +; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58 +; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56 +; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54 +; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 +; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 +; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 +; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 +; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 +; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40 +; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 +; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 +; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 +; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 +; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12 ; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 -; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18 -; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20 -; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22 -; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 
offset:24 -; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30 +; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18 +; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20 +; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22 +; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 +; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26 +; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 +; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: 
v_lshlrev_b32_e32 v2, 16, v23 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0 -; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen -; GFX7-NEXT: 
v_lshlrev_b32_e32 v20, 16, v28 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0 
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0 -; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0 -; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 +; 
GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0 -; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: 
v_add_i32_e32 v17, vcc, 0x78, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8 -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 -; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0 +; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7 +; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; GFX7-NEXT: 
buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 ; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen 
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_extload_v32bf16_to_v32f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1 ; 
GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 @@ -8126,469 +8122,473 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 ; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1 ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1 +; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1 -; GFX8-NEXT: flat_load_ushort v43, v[1:2] +; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 
4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 +; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1 +; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 ; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v44, v[50:51] +; GFX8-NEXT: flat_load_ushort v45, v[50:51] ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v45, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1 +; GFX8-NEXT: flat_load_ushort v46, v[50:51] +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v46, v[52:53] -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1 +; GFX8-NEXT: flat_load_ushort v47, v[52:53] +; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; 
GFX8-NEXT: flat_load_ushort v47, v[54:55] -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1 +; GFX8-NEXT: flat_load_ushort v56, v[54:55] +; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v56, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_ushort v57, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1 +; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1 -; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v41, v[41:42] -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v42, v[9:10] -; GFX8-NEXT: flat_load_ushort v9, v[35:36] -; GFX8-NEXT: flat_load_ushort v10, v[37:38] -; GFX8-NEXT: flat_load_ushort v35, v[48:49] -; GFX8-NEXT: flat_load_ushort v36, v[50:51] -; GFX8-NEXT: flat_load_ushort v37, v[52:53] -; GFX8-NEXT: flat_load_ushort v48, v[54:55] -; GFX8-NEXT: flat_load_ushort v39, v[39:40] -; GFX8-NEXT: flat_load_ushort v49, v[1:2] -; GFX8-NEXT: flat_load_ushort v50, v[3:4] -; GFX8-NEXT: flat_load_ushort v51, v[5:6] -; GFX8-NEXT: flat_load_ushort v52, v[7:8] -; GFX8-NEXT: flat_load_ushort v53, v[11:12] -; GFX8-NEXT: flat_load_ushort v38, v[13:14] -; GFX8-NEXT: flat_load_ushort v14, v[17:18] -; GFX8-NEXT: flat_load_ushort v11, v[21:22] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0 -; GFX8-NEXT: flat_load_ushort v15, v[15:16] -; GFX8-NEXT: flat_load_ushort v13, v[19:20] -; GFX8-NEXT: flat_load_ushort v8, v[23:24] -; GFX8-NEXT: flat_load_ushort v6, v[25:26] -; GFX8-NEXT: flat_load_ushort v5, v[27:28] -; GFX8-NEXT: flat_load_ushort v7, v[29:30] -; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v16, v[33:34] -; GFX8-NEXT: 
v_add_u32_e32 v18, vcc, 0xc4, v0 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0 +; GFX8-NEXT: flat_load_ushort v58, v[39:40] +; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1 +; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 +; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v42, v[42:43] +; GFX8-NEXT: flat_load_ushort v34, v[33:34] +; GFX8-NEXT: flat_load_ushort v36, v[35:36] +; GFX8-NEXT: flat_load_ushort v38, v[37:38] +; GFX8-NEXT: flat_load_ushort v39, v[48:49] +; GFX8-NEXT: flat_load_ushort v48, v[50:51] +; GFX8-NEXT: flat_load_ushort v51, v[52:53] +; GFX8-NEXT: flat_load_ushort v52, v[54:55] +; GFX8-NEXT: flat_load_ushort v53, v[40:41] +; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 +; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v37, v[3:4] +; GFX8-NEXT: flat_load_ushort v35, v[5:6] +; GFX8-NEXT: flat_load_ushort v33, v[7:8] +; GFX8-NEXT: flat_load_ushort v8, v[9:10] +; GFX8-NEXT: flat_load_ushort v6, v[11:12] +; GFX8-NEXT: flat_load_ushort v4, v[13:14] +; GFX8-NEXT: flat_load_ushort v2, v[15:16] +; GFX8-NEXT: flat_load_ushort v1, v[19:20] +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0 
-; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; 
GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 +; GFX8-NEXT: flat_load_ushort v3, v[17:18] +; GFX8-NEXT: flat_load_ushort v5, v[21:22] +; GFX8-NEXT: flat_load_ushort v7, v[23:24] +; GFX8-NEXT: flat_load_ushort v9, v[25:26] +; GFX8-NEXT: flat_load_ushort v10, v[27:28] +; GFX8-NEXT: flat_load_ushort v11, v[29:30] +; GFX8-NEXT: flat_load_ushort v12, v[31:32] +; GFX8-NEXT: flat_load_ushort v13, v[49:50] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: 
v_lshlrev_b32_e32 v14, 16, v58 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 +; 
GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 +; GFX8-NEXT: v_lshlrev_b32_e32 
v13, 16, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0 -; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0 -; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 -; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 -; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0 -; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0 -; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v9, 16, v15 -; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0 -; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0 -; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0 -; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0 -; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 
0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0 -; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v17, v5, 
s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0 +; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; 
GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_extload_v32bf16_to_v32f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30 +; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62 +; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60 +; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58 +; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56 +; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54 +; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52 +; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50 +; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48 +; GFX9-NEXT: global_load_ushort v17, v[1:2], off 
offset:46 +; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44 +; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42 +; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40 +; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38 +; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36 +; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34 +; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32 +; GFX9-NEXT: global_load_ushort v25, v[1:2], off +; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2 +; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30 ; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 ; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 ; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 ; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10 +; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24 +; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 +; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 +; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 +; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 ; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 ; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: 
v_lshlrev_b32_e32 v23, 16, v25 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 ; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 ; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen 
offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX9-NEXT: s_waitcnt vmcnt(32) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29 -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32 -; 
GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20 +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen 
offset:188 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 +; GFX9-NEXT: s_waitcnt vmcnt(43) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: s_waitcnt vmcnt(38) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2 +; 
GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; 
GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 
offset:68 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 
0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8612,179 +8612,177 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 ; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 ; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 -; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32 -; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34 -; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36 -; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40 -; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42 -; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44 -; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46 -; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48 -; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62 -; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50 -; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52 -; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54 -; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60 -; GFX10-NEXT: global_load_ushort 
v33, v[1:2], off offset:56 -; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58 +; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62 +; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32 +; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34 +; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36 +; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60 +; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38 +; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40 +; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58 +; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42 +; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44 +; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56 +; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46 +; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48 +; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54 +; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50 +; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3 ; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7 ; GFX10-NEXT: s_waitcnt vmcnt(26) -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8 ; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshlrev_b32_e32 
v49, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11 ; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12 ; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13 ; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14 -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36 +; GFX10-NEXT: s_waitcnt vmcnt(17) +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17 +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18 ; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GFX10-NEXT: s_waitcnt vmcnt(14) ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 ; GFX10-NEXT: s_waitcnt vmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21 ; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22 ; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23 -; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25 ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v5, 16, v26 ; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34 
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38 +; 
GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen 
offset:180 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword 
v1, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 ; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 ; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 -; GFX10-NEXT: 
buffer_store_dword v10, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword 
v13, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v32bf16_to_v32f64: @@ -10059,55 +10057,47 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_add_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; 
GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -10116,6 +10106,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_add_f32_e32 v4, v4, v20 ; GCN-NEXT: v_add_f32_e32 v3, v3, v19 ; GCN-NEXT: v_add_f32_e32 v2, v2, v18 ; GCN-NEXT: v_add_f32_e32 v1, v1, v17 @@ -10135,7 +10133,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_add_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -10145,20 +10143,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; 
GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -10169,25 +10169,24 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -10212,7 +10211,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 ; 
GFX7-NEXT: v_add_f32_e32 v8, v8, v24 @@ -10231,7 +10229,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -11689,10 +11687,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -11995,278 +11993,278 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, 
v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_add_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_add_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_add_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_add_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_add_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_add_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_add_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_add_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_add_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_add_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_add_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_add_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 
0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_add_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_add_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_add_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_add_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_add_f32_e32 v20, v29, v36 -; 
GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_add_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: 
v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, 
v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: 
v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: 
v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_add_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_add_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: 
v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14496,55 +14494,47 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 
0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 
0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -14553,6 +14543,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 @@ -14572,7 +14570,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -14582,20 +14580,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: 
v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -14606,25 +14606,24 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -14649,7 +14648,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 @@ -14668,7 +14666,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -16126,10 +16124,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -16432,278 +16430,278 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x 
bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 
0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 
16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 
v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: 
v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18574,55 +18572,47 @@ define <16 x bfloat> 
@v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_min_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_min_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_min_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -18631,6 +18621,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_min_f32_e32 v10, v10, v26 +; GCN-NEXT: v_min_f32_e32 v9, v9, v25 +; GCN-NEXT: v_min_f32_e32 v8, v8, v24 +; GCN-NEXT: v_min_f32_e32 v7, v7, v23 +; GCN-NEXT: v_min_f32_e32 v6, v6, v22 +; GCN-NEXT: v_min_f32_e32 v5, v5, v21 +; 
GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 @@ -18650,7 +18648,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -18660,20 +18658,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -18684,25 +18684,24 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -18727,7 +18726,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 @@ -18746,7 +18744,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -20204,10 +20202,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -20510,278 +20508,278 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_min_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_min_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_min_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_min_f32_e32 v34, v34, v51 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_min_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_min_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_min_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_min_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_min_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_min_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_min_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_min_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; 
GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_min_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_min_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_min_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_min_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_min_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 
0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_min_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, 
v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, 
vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 
1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, 
v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; 
GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; 
GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_min_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_min_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: 
v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22193,55 +22191,47 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, 
v7 -; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_max_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_max_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: 
v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_max_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -22250,6 +22240,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_max_f32_e32 v10, v10, v26 +; GCN-NEXT: v_max_f32_e32 v9, v9, v25 +; GCN-NEXT: v_max_f32_e32 v8, v8, v24 +; GCN-NEXT: v_max_f32_e32 v7, v7, v23 +; GCN-NEXT: v_max_f32_e32 v6, v6, v22 +; GCN-NEXT: v_max_f32_e32 v5, v5, v21 +; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 @@ -22269,7 +22267,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -22279,20 +22277,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -22303,25 +22303,24 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 
0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -22346,7 +22345,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 @@ -22365,7 +22363,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -23823,10 +23821,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -24129,278 +24127,278 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; 
GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 
v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_max_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_max_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_max_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_max_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_max_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_max_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_max_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_max_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_max_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; 
GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_max_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_max_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_max_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_max_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; 
GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_max_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: 
v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_max_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 
v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo 
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 
v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; 
GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, 
v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: 
v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_max_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_max_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -35701,81 +35699,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-LABEL: v_select_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 
1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, 
v10 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; 
GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 ; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -35808,67 +35806,67 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; 
GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 +; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -35877,21 +35875,21 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x 
bfloat> ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -37187,30 +37185,30 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-LABEL: v_vselect_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v13, 
1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -37241,45 +37239,45 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; 
GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ 
-37548,16 +37546,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v1, 1, v10 ; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v3, 1, v11 -; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v5, 1, v12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5 +; GCN-NEXT: v_and_b32_e32 v2, 1, v11 +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v3, 1, v12 +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v7, 1, v13 ; GCN-NEXT: v_and_b32_e32 v8, 1, v14 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 @@ -37624,22 +37622,22 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; 
GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 @@ -37665,151 +37663,136 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-LABEL: v_vselect_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; 
GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v13 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX7-NEXT: v_writelane_b32 v31, s34, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v5, 1, v15 -; GFX7-NEXT: 
v_writelane_b32 v31, s35, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31] -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27] -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25] -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, 
v7, s[12:13] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_readlane_b32 s35, v31, 3 -; GFX7-NEXT: v_readlane_b32 s34, v31, 2 -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17] -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15] -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: 
v_cndmask_b32_e32 v5, v21, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17] +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11] -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v1, v16, vcc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v16bf16: @@ -37840,53 +37823,51 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX8-NEXT: v_writelane_b32 v31, s31, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 ; GFX8-NEXT: v_writelane_b32 v31, s34, 2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20 
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 ; GFX8-NEXT: v_writelane_b32 v31, s35, 3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15] -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] @@ -37899,11 +37880,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 ; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s35, v31, 3 ; GFX8-NEXT: v_readlane_b32 s34, v31, 2 ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 @@ -37917,81 +37900,81 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v7 -; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[24:25], 1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17] -; 
GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21] -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23] -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, 
v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4 -; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4 -; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4 +; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4 +; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4 +; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: @@ -38008,13 +37991,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 @@ -38023,13 +38006,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v54, 16, v24 +; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo @@ -38048,11 +38031,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo @@ -38065,12 +38048,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v16bf16: @@ -39456,219 +39439,206 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX10-NEXT: s_clause 0xa +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v24 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 ; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX10-NEXT: 
buffer_load_ushort v33, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26 -; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 -; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10 -; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8 -; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 -; 
GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 -; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21 -; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17 -; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15 -; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 -; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9 -; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v33 -; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5 -; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36 -; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10 -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54 -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9 -; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_cndmask_b32_e64 v11, 
v64, v36, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55 -; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50 +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 -; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15 -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67 +; GFX10-NEXT: v_and_b32_e32 v35, 1, v35 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14 -; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65 +; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; 
GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6 +; GFX10-NEXT: s_clause 0x6 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX10-NEXT: v_and_b32_e32 v28, 1, v29 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v55, v16, vcc_hi -; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 -; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100 -; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 -; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX10-NEXT: v_and_b32_e32 v26, 1, v27 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX10-NEXT: v_and_b32_e32 v24, 1, v25 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 1, v23 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo 
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100 +; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index dd9c9a3699b4ff..05c2e0077f4aea 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[8:9], 0x2 +; CHECK-NEXT: s_load_dword s27, s[8:9], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] ; CHECK-NEXT: s_add_u32 s96, s96, s15 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_eq_u32 s44, 0 +; 
CHECK-NEXT: s_cmp_eq_u32 s27, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -971,10 +971,10 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v1, s98, 3 ; CHECK-NEXT: v_writelane_b32 v0, s92, 61 ; CHECK-NEXT: v_writelane_b32 v1, s99, 4 -; CHECK-NEXT: s_mov_b32 s49, s12 +; CHECK-NEXT: s_mov_b32 s31, s12 ; CHECK-NEXT: v_writelane_b32 v0, s93, 62 ; CHECK-NEXT: v_writelane_b32 v1, s100, 5 -; CHECK-NEXT: s_cmp_eq_u32 s49, 0 +; CHECK-NEXT: s_cmp_eq_u32 s31, 0 ; CHECK-NEXT: v_writelane_b32 v0, s94, 63 ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 4c7a4ba3a44a5f..cdfaed0a203e92 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2626,42 +2626,42 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v32i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[33:36], off, s[16:19], 0 -; SDAG-NEXT: buffer_load_dwordx4 v[48:51], off, s[16:19], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[36:39], off, s[16:19], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[32:35], off, s[16:19], 0 offset:16 ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[33:34] -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[35:36] -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[48:49] -; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[50:51] -; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v34 -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v34 -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v34 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v35 -; SDAG-NEXT: v_lshrrev_b32_e32 
v10, 16, v35 -; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v36 -; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v36 -; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v48 -; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v48 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v49 -; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v49 -; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v49 -; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v50 -; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v50 -; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v51 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v51 -; SDAG-NEXT: v_mov_b32_e32 v0, v33 -; SDAG-NEXT: v_mov_b32_e32 v4, v34 -; SDAG-NEXT: v_mov_b32_e32 v8, v35 -; SDAG-NEXT: v_mov_b32_e32 v12, v36 -; SDAG-NEXT: v_mov_b32_e32 v16, v48 -; SDAG-NEXT: v_mov_b32_e32 v20, v49 -; SDAG-NEXT: v_mov_b32_e32 v24, v50 -; SDAG-NEXT: v_mov_b32_e32 v28, v51 +; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[36:37] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[34:35] +; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v37 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v37 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v38 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v39 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v39 +; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v34 +; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v34 +; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v35 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 
24, v35 +; SDAG-NEXT: v_mov_b32_e32 v0, v36 +; SDAG-NEXT: v_mov_b32_e32 v4, v37 +; SDAG-NEXT: v_mov_b32_e32 v8, v38 +; SDAG-NEXT: v_mov_b32_e32 v12, v39 +; SDAG-NEXT: v_mov_b32_e32 v16, v32 +; SDAG-NEXT: v_mov_b32_e32 v20, v33 +; SDAG-NEXT: v_mov_b32_e32 v24, v34 +; SDAG-NEXT: v_mov_b32_e32 v28, v35 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: load_v32i8: @@ -2717,47 +2717,47 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: store_v32i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 ; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15 -; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 ; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; SDAG-NEXT: v_lshlrev_b16_e32 v7, 8, v7 ; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7 +; SDAG-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; 
SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11 -; SDAG-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; SDAG-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v23 -; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v17 -; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v19 -; SDAG-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; 
SDAG-NEXT: v_lshlrev_b16_e32 v14, 8, v25 +; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v27 +; SDAG-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; SDAG-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; SDAG-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; SDAG-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 -; SDAG-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v5, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v4, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v3, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; SDAG-NEXT: v_or_b32_sdwa v0, v30, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 0009a84765639c..56ecfa298a348f 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2487,10 +2487,10 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v5, v27, v5 ; SI-NEXT: v_or_b32_e32 v9, v23, v9 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v17, 3, v28 ; SI-NEXT: v_and_b32_e32 v18, 3, v24 -; SI-NEXT: v_and_b32_e32 v20, 3, v20 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_and_b32_e32 v19, 3, v20 ; SI-NEXT: v_and_b32_e32 v16, 3, v16 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_and_b32_e32 v12, 3, v12 @@ -2502,7 +2502,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_and_b32_e32 v0, 3, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v3, v18, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v9 +; SI-NEXT: v_or_b32_e32 v5, v19, v9 ; SI-NEXT: v_or_b32_e32 v7, v16, v13 ; SI-NEXT: v_or_b32_e32 v9, v12, v14 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir index 00eb2b7e1aa8dd..4945c7020ca18c 100644 --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -49,39 +49,39 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: 
[[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec - ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] - ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 + ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec + ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 
[[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] + ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1 ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]].sub0 - ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[DEF5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]] + ; CHECK-NEXT: undef [[DEF4:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index cdd4c72f3717f0..8a1c68b3f66150 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: 
[[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -32,10 +32,9 @@ body: | ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -51,33 +50,34 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec - ; 
CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec ; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] + ; CHECK-NEXT: $vgpr0 = COPY [[DEF10]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF6]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MUL_F32_e32_1]] ; CHECK-NEXT: $vgpr1 = COPY [[V_MUL_F32_e32_2]] ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]] - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead 
[[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index d9182d7ace8bfe..59bc7f332bf1e4 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -152,38 
+152,38 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v14, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v5, v15, v31 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v14 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc -; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 +; GFX9-NEXT: v_and_b32_e32 v6, v30, v0 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 ; GFX9-NEXT: 
v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 @@ -1227,13 +1227,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc ; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 ; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 @@ -1245,8 +1245,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 -; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10 +; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1258,15 +1258,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; 
GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11 ; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 ; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] @@ -1291,10 +1291,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 ; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 @@ -1309,23 +1309,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, 
v[10:11] +; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 @@ -1336,13 +1336,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] ; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1352,54 +1352,54 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2 ; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 -; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 -; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 -; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, 
vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18 +; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10 +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc ; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 +; GFX9-G-NEXT: v_and_b32_e32 v3, v28, v5 ; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc -; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 -; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v3, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow @@ -1407,9 +1407,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 ; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: 
v_or_b32_e32 v12, v12, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: .LBB0_6: ; %Flow3 @@ -1418,9 +1418,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v12, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -2439,16 +2439,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[26:27], 1, v[10:11] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_or_b32_e32 v10, v16, v26 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v11, v17, v27 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v17 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc @@ -2457,20 +2456,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 ; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 
v1, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, v26, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v26, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 @@ -3506,37 +3506,37 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 ; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] -; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3 -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9 -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 -; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc -; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 -; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] -; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4 -; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] -; GFX9-G-NEXT: 
v_and_b32_e32 v16, v3, v5 -; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 -; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] -; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6 -; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7 -; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[16:17] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v10, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v11, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v4 +; GFX9-G-NEXT: v_and_b32_e32 v16, v12, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v13 +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v6 +; GFX9-G-NEXT: v_and_b32_e32 v17, v12, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v10, v13, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v11, v17, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 ; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc -; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] 
+; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 691f3d36bc7360..8d65fa053eaa49 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,430 +6,430 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v11 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v28, v26 -; SDAG-NEXT: v_mov_b32_e32 v29, v27 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v26, v24 +; SDAG-NEXT: v_mov_b32_e32 v27, v25 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v16 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v17 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0, v8 -; SDAG-NEXT: v_or_b32_e32 v0, v16, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v2 -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v1 +; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, 
v20 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 +; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v17, v3 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v21 -; SDAG-NEXT: v_min_u32_e32 v18, v22, v18 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v3 -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v20, s[4:5] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 -; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v9, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v18, v8, v1, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v20, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v8, v31, v0 +; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 +; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 +; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] +; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v9 -; SDAG-NEXT: v_or_b32_e32 v9, v30, v1 +; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_min_u32_e32 v20, v20, v21 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v11, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v20 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 +; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 +; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 
v2, v3, v2, s[6:7] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 +; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v18, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v2, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc +; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; SDAG-NEXT: v_lshl_b64 v[18:19], 
v[20:21], v18 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc -; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 -; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 -; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[2:3], v24 -; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: 
v_cndmask_b32_e64 v9, v9, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 -; SDAG-NEXT: v_sub_i32_e32 v37, vcc, 64, v32 -; SDAG-NEXT: v_subrev_i32_e32 v48, vcc, 64, v32 -; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32 -; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_lshl_b64 v[38:39], v[2:3], v37 -; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v48 -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v39 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v38 -; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v0, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v24, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v1, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5] +; SDAG-NEXT: 
v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30 +; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v25, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v10 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v24 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v25 -; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v2 -; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v17, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v25, v24, v31 -; SDAG-NEXT: v_and_b32_e32 v48, v24, v30 -; SDAG-NEXT: v_and_b32_e32 v49, v24, v0 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v24 -; SDAG-NEXT: v_and_b32_e32 v50, v24, v1 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v25 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 
v24, vcc, v16, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v17, v50, vcc -; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_or_b32_e32 v19, v17, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 +; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v20, v8, v29 +; SDAG-NEXT: v_and_b32_e32 v22, v8, v28 +; SDAG-NEXT: v_and_b32_e32 v38, v8, v0 +; SDAG-NEXT: v_and_b32_e32 v39, v8, v1 +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v32, v34 -; SDAG-NEXT: v_or_b32_e32 v17, v33, v35 +; SDAG-NEXT: v_or_b32_e32 v16, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v17, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v22, v10 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v3, v11, v3 +; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v2, v10, v2 +; SDAG-NEXT: v_mov_b32_e32 v17, v9 +; SDAG-NEXT: 
v_mov_b32_e32 v16, v8 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v20, v19, v1 -; SDAG-NEXT: v_or_b32_e32 v22, v11, v3 -; SDAG-NEXT: v_or_b32_e32 v21, v18, v0 -; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 +; SDAG-NEXT: v_or_b32_e32 v19, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 +; SDAG-NEXT: v_or_b32_e32 v23, v8, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v18, v16 -; SDAG-NEXT: v_mov_b32_e32 v19, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, v16 +; SDAG-NEXT: v_mov_b32_e32 v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] -; SDAG-NEXT: 
v_sub_i32_e32 v7, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 -; SDAG-NEXT: v_ffbh_u32_e32 v8, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 ; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v8 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 -; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 +; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v24, v13, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v25, v12, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v8, v30 -; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6 -; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v15, vcc +; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 +; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 +; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v25 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v8, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v6, v25, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v8, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; 
SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 ; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 -; SDAG-NEXT: v_or_b32_e32 v7, v24, v1 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 +; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 ; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_min_u32_e32 v6, v8, v14 -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 +; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 -; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 +; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v8, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v7, v9 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 -; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v7, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9 -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9 -; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] 
-; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 -; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 -; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 +; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 +; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v25 +; SDAG-NEXT: 
v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 -; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 +; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; 
SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 -; SDAG-NEXT: v_or_b32_e32 v9, v13, v9 -; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 -; SDAG-NEXT: v_or_b32_e32 v8, v12, v8 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v15 -; SDAG-NEXT: v_and_b32_e32 v38, v15, v1 -; SDAG-NEXT: v_and_b32_e32 v39, v15, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v15, v24 -; SDAG-NEXT: v_and_b32_e32 v15, v15, v25 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 +; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 +; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 +; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 +; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 +; SDAG-NEXT: v_and_b32_e32 v39, v8, v0 +; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, 
v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 -; SDAG-NEXT: v_mov_b32_e32 v15, v11 -; SDAG-NEXT: v_mov_b32_e32 v14, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v14, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 -; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 +; SDAG-NEXT: v_or_b32_e32 v14, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v9, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_xor_b32_e32 v3, v29, v28 -; SDAG-NEXT: v_xor_b32_e32 v2, v27, v26 -; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 +; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 +; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 -; SDAG-NEXT: v_xor_b32_e32 v5, v21, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v22, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3 +; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 -; SDAG-NEXT: 
v_xor_b32_e32 v8, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 +; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 ; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 +; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: v_sdiv_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 ; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 ; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 @@ -438,71 +438,71 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9 ; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10 ; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v24 +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v24, vcc ; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25 ; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v24, vcc +; 
GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v24, vcc ; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v8, v27 ; GISEL-NEXT: v_ffbh_u32_e32 v9, v26 -; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v18 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v19 ; GISEL-NEXT: v_or_b32_e32 v0, v26, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v27, v11 -; GISEL-NEXT: v_or_b32_e32 v2, v16, v18 -; GISEL-NEXT: v_or_b32_e32 v3, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v2, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v3, v19, v21 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 -; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v30, v19 -; GISEL-NEXT: v_ffbh_u32_e32 v31, v18 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v21 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; GISEL-NEXT: v_min_u32_e32 v0, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29 -; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 -; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31 +; GISEL-NEXT: v_min_u32_e32 v1, v23, v22 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v28 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v30 +; GISEL-NEXT: v_min_u32_e32 v2, v29, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v31, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 -; GISEL-NEXT: v_min_u32_e32 v1, v28, v1 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v30, v3 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; 
GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v9, v3, v1 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v9, v22, v20 -; GISEL-NEXT: v_and_b32_e32 v20, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v9, v22, v16 ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: v_and_b32_e32 v9, 1, v9 +; GISEL-NEXT: v_and_b32_e32 v8, 1, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; 
GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2 @@ -511,110 +511,111 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2 ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], v32 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, 
s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[20:21], v28 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[18:19], v28 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22 +; GISEL-NEXT: v_or_b32_e32 v22, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v23, v3, v23 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, v3, v19, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 -; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc -; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22 -; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34 -; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v1, v3, v23 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5] -; GISEL-NEXT: 
v_cndmask_b32_e64 v22, v0, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7] -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v23, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 ; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v21 +; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 +; GISEL-NEXT: v_or_b32_e32 v16, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v17, v1, v3 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v19 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v32, v0 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v1, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v34, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v35, v3, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v22, 31, v18 +; GISEL-NEXT: v_and_b32_e32 v18, v22, v26 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v18 +; GISEL-NEXT: v_and_b32_e32 v0, v22, v27 +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, v22, v10 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, v22, v11 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v18, v22 -; 
GISEL-NEXT: v_or_b32_e32 v3, v36, v23 ; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 ; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v19, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v22 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 -; GISEL-NEXT: v_and_b32_e32 v18, v0, v27 -; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 -; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v36 +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 +; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v17 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v22, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v23, v1, v3 ; 
GISEL-NEXT: .LBB0_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 @@ -630,18 +631,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc -; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc ; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc ; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 ; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 ; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 -; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 -; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 ; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 ; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 @@ -732,8 +733,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 -; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc ; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 ; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc @@ -782,8 
+783,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v12, v0, v20 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, v6 @@ -808,8 +809,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 ; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 -; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v22, v3 +; GISEL-NEXT: v_xor_b32_e32 v1, v23, v3 ; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 @@ -853,11 +854,11 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 ; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 ; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 ; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 @@ -868,146 +869,146 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 
-; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 +; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 
0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 -; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v26, v28 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; SDAG-NEXT: v_or_b32_e32 v19, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31 ; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] +; 
SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 -; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 -; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v22 +; SDAG-NEXT: v_or_b32_e32 v23, v21, v23 +; SDAG-NEXT: v_or_b32_e32 v22, v20, v22 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; SDAG-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v26 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v21, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v0, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v26 +; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 -; SDAG-NEXT: s_mov_b64 s[12:13], 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 -; SDAG-NEXT: v_mov_b32_e32 v26, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 -; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 -; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] -; SDAG-NEXT: 
v_cndmask_b32_e64 v21, v36, v21, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v26, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 -; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 -; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 -; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 -; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 -; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc -; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, 
v19 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v24, v20, v8 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v20, v9 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v20, v10 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v20, v11 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc +; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 -; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 -; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] -; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 -; SDAG-NEXT: v_mov_b32_e32 v26, v22 +; SDAG-NEXT: v_or_b32_e32 v24, v26, v28 +; SDAG-NEXT: v_or_b32_e32 v25, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v34 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 ; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] +; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 -; SDAG-NEXT: 
v_lshrrev_b32_e32 v8, 31, v24 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 +; SDAG-NEXT: v_or_b32_e32 v16, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v21, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v22, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v20, v2 ; SDAG-NEXT: .LBB1_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 @@ -1045,20 +1046,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v24, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v1, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v1, v3 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, 
v[2:3] -; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v10 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1069,118 +1070,118 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v22, v24 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 +; SDAG-NEXT: v_or_b32_e32 v11, v23, v25 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27 +; SDAG-NEXT: v_or_b32_e32 v1, v1, v11 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: 
v_addc_u32_e32 v11, vcc, 0, v1, vcc -; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 -; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v8, v24 -; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0 -; SDAG-NEXT: v_or_b32_e32 v2, v11, v25 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2] -; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 -; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 -; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 -; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[4:5], v22 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v22 +; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v22 +; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 ; SDAG-NEXT: 
v_lshr_b64 v[6:7], v[6:7], v28 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v21, v32 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v32 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v5 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v20 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v30 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 -; SDAG-NEXT: v_or_b32_e32 v3, v10, v3 -; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v2, v9, v2 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, 
vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v20, 1, v23 -; SDAG-NEXT: v_and_b32_e32 v30, v23, v15 -; SDAG-NEXT: v_and_b32_e32 v31, v23, v14 -; SDAG-NEXT: v_and_b32_e32 v32, v23, v13 -; SDAG-NEXT: v_and_b32_e32 v23, v23, v12 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc -; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v2 +; SDAG-NEXT: v_or_b32_e32 v2, v4, v30 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v31 +; SDAG-NEXT: v_or_b32_e32 v1, v21, v1 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v26, v2 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v27, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v28, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v29, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v4 +; SDAG-NEXT: v_and_b32_e32 v31, v30, v13 +; SDAG-NEXT: v_and_b32_e32 v4, v30, v12 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v2, v4 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v31, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 +; SDAG-NEXT: v_or_b32_e32 v0, v20, v0 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v30 +; SDAG-NEXT: v_and_b32_e32 v11, v30, v15 +; SDAG-NEXT: v_and_b32_e32 v30, v30, v14 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v30, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc ; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v31, v11, v25 -; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 +; SDAG-NEXT: v_or_b32_e32 v31, v23, v25 +; SDAG-NEXT: v_or_b32_e32 v30, v22, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 
v0, v22, v0 -; SDAG-NEXT: v_mov_b32_e32 v23, v21 -; SDAG-NEXT: v_mov_b32_e32 v22, v20 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 +; SDAG-NEXT: v_mov_b32_e32 v11, v3 +; SDAG-NEXT: v_mov_b32_e32 v10, v2 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; SDAG-NEXT: v_or_b32_e32 v8, v10, v3 -; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 -; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v10, v3, v5 +; SDAG-NEXT: v_or_b32_e32 v9, v20, v0 +; SDAG-NEXT: v_or_b32_e32 v11, v2, v4 ; SDAG-NEXT: .LBB1_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v19 @@ -1198,7 +1199,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v16, v2 ; GISEL-NEXT: v_mov_b32_e32 v17, v3 -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 @@ -1209,20 +1209,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ffbh_u32_e32 v23, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v17 ; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 ; GISEL-NEXT: v_min_u32_e32 v2, v20, v2 ; GISEL-NEXT: v_min_u32_e32 v3, v22, v3 ; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: v_min_u32_e32 v19, v29, v19 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2 @@ -1235,28 +1236,28 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v20 ; GISEL-NEXT: v_or_b32_e32 v3, v23, v21 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v26, v18 -; GISEL-NEXT: v_and_b32_e32 v18, 1, v3 ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v3 +; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, 
vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -1560,12 +1561,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] @@ -1574,106 +1575,106 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v24, v1 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v10, vcc 
+; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v22, v24 +; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 ; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v20, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v31 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v30 +; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, v8, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v2 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v20 +; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v11, v18 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v8, v20, 
v21 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v22 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v21, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v19 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; 
SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], v24 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 ; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 
v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v32 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 @@ -1686,73 +1687,73 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 ; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v27 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v26 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v27 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v26 ; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v49, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v48, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v49, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v48, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v11, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v24, v10, v16, vcc -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v25 +; SDAG-NEXT: 
v_lshrrev_b32_e32 v48, 31, v25 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v26, v26, v10 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v49 -; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v24 -; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v25, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v27, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v48, v10, v31 -; SDAG-NEXT: v_and_b32_e32 v49, v10, v30 -; SDAG-NEXT: v_and_b32_e32 v50, v10, v2 -; SDAG-NEXT: v_and_b32_e32 v51, v10, v3 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48 -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_or_b32_e32 v22, v26, v48 +; SDAG-NEXT: v_or_b32_e32 v23, v24, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v24, v8, v31 +; SDAG-NEXT: v_and_b32_e32 v26, v8, v30 +; SDAG-NEXT: v_and_b32_e32 v48, v8, v2 +; SDAG-NEXT: v_and_b32_e32 v49, v8, v3 +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v26, vcc, 
v22, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v49, vcc ; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; SDAG-NEXT: v_or_b32_e32 v48, v32, v34 -; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v22, v32, v34 +; SDAG-NEXT: v_or_b32_e32 v23, v33, v35 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v22, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 +; SDAG-NEXT: v_mov_b32_e32 v23, v9 +; SDAG-NEXT: v_mov_b32_e32 v22, v8 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v22 -; SDAG-NEXT: v_or_b32_e32 v35, v19, v9 -; SDAG-NEXT: v_or_b32_e32 v27, v11, v21 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v8 -; SDAG-NEXT: v_or_b32_e32 v33, v10, v20 +; SDAG-NEXT: v_or_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v27, v9, v21 +; SDAG-NEXT: v_or_b32_e32 v33, v8, v20 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 @@ -2025,28 +2026,28 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; 
%_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 -; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 -; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v19, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v20, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v20 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20 -; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v18 +; GISEL-NEXT: v_xor_b32_e32 v9, v9, v18 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v18 +; GISEL-NEXT: v_xor_b32_e32 v11, v11, v18 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc -; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20 -; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v18 +; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v18, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc -; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v20, v29 +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v18, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v18, v29 ; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 ; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 @@ -2055,53 +2056,53 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 ; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 
-; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v26, v9 -; GISEL-NEXT: v_ffbh_u32_e32 v27, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v9 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25 -; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 -; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27 +; GISEL-NEXT: v_min_u32_e32 v0, v18, v21 +; GISEL-NEXT: v_min_u32_e32 v1, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v24 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v26 +; GISEL-NEXT: v_min_u32_e32 v2, v25, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v27, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 -; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v26, v3 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; 
GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[19:20] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 ; GISEL-NEXT: v_or_b32_e32 v19, v3, v1 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v19, v20, v21 -; GISEL-NEXT: v_and_b32_e32 v20, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v19, v21, v20 ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v19 +; GISEL-NEXT: v_and_b32_e32 v18, 1, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -2154,11 +2155,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc -; GISEL-NEXT: 
v_cndmask_b32_e32 v25, v3, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, v3, v17, vcc ; GISEL-NEXT: v_mov_b32_e32 v23, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -2166,40 +2167,40 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v27 ; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19 ; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v3, v48, v25 -; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v2, v24, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v26, v48 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v27, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v31, v33 ; GISEL-NEXT: v_or_b32_e32 v1, v32, v34 ; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v25, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 ; 
GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 ; GISEL-NEXT: v_and_b32_e32 v1, v0, v30 -; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 -; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v24, v0, v29 +; GISEL-NEXT: v_and_b32_e32 v48, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v49, v0, v11 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v39 ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -2486,11 +2487,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 ; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 ; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 ; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 @@ -2501,65 +2502,65 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; 
SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 ; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 -; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18 +; 
SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v19, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v21, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v19, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 +; SDAG-NEXT: v_or_b32_e32 v19, v25, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v24, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] ; SDAG-NEXT: 
v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v30 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 @@ -2572,73 +2573,73 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v29 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v28 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v38, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v37, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v17, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v16, v0, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; 
SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_or_b32_e32 v28, v28, v18 -; SDAG-NEXT: v_or_b32_e32 v26, v26, v38 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v39 -; SDAG-NEXT: v_or_b32_e32 v17, v21, v17 -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v16 -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18 -; SDAG-NEXT: v_and_b32_e32 v39, v38, v8 -; SDAG-NEXT: v_and_b32_e32 v48, v38, v9 -; SDAG-NEXT: v_and_b32_e32 v49, v38, v10 -; SDAG-NEXT: v_and_b32_e32 v18, 1, v38 -; SDAG-NEXT: v_and_b32_e32 v38, v38, v11 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 +; SDAG-NEXT: v_or_b32_e32 v24, v28, v38 +; SDAG-NEXT: v_or_b32_e32 v25, v26, v39 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v16 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v25 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v26, v16, v8 +; SDAG-NEXT: v_and_b32_e32 v28, v16, v9 +; SDAG-NEXT: v_and_b32_e32 v38, v16, v10 +; SDAG-NEXT: v_and_b32_e32 v39, v16, v11 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v25, v26 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v24, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, 
v39, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v24, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v25, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 -; SDAG-NEXT: v_mov_b32_e32 v25, v19 -; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 +; SDAG-NEXT: v_mov_b32_e32 v25, v17 +; SDAG-NEXT: v_mov_b32_e32 v24, v16 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v33, v21, v17 -; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 -; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v33, v21, v19 +; SDAG-NEXT: v_or_b32_e32 v30, v17, v23 +; SDAG-NEXT: v_or_b32_e32 v31, v20, v18 +; SDAG-NEXT: v_or_b32_e32 v32, v16, v22 ; SDAG-NEXT: .LBB3_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 @@ -2678,63 +2679,63 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, 
v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v17, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_and_b32_e32 v20, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc ; SDAG-NEXT: v_lshl_b64 
v[22:23], v[4:5], v22 -; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v34, v36 -; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v35, v37 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] -; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 -; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 -; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: 
v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 ; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 @@ -2747,100 +2748,100 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 ; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v21, v29 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v28 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v50, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v49, v18, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 
v51, 31, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v28, v28, v20 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 ; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v20, 1, v25 -; SDAG-NEXT: v_and_b32_e32 v50, v25, v15 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v51 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v21, v25, v21 +; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v16 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25 +; SDAG-NEXT: v_and_b32_e32 v28, v25, v12 +; SDAG-NEXT: v_and_b32_e32 v50, v25, v13 ; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 -; SDAG-NEXT: v_and_b32_e32 v52, v25, v13 -; SDAG-NEXT: v_and_b32_e32 v25, v25, v12 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc +; SDAG-NEXT: v_and_b32_e32 v52, v25, v15 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v51, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc ; 
SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 +; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v25 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v21 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 +; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 ; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 ; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0 -; SDAG-NEXT: 
v_mul_lo_u32 v35, v21, v14 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14 ; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 ; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0 -; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; SDAG-NEXT: v_mov_b32_e32 v18, v11 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19] +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; SDAG-NEXT: v_mov_b32_e32 v20, v11 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v34 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 ; SDAG-NEXT: v_mov_b32_e32 v28, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mov_b32_e32 v27, v21 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35 -; SDAG-NEXT: v_mov_b32_e32 v18, v15 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v20, v35 +; SDAG-NEXT: v_mov_b32_e32 v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v18, v[20:21] ; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17] ; SDAG-NEXT: v_mov_b32_e32 v8, v11 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8 @@ -2849,24 +2850,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] ; SDAG-NEXT: v_mov_b32_e32 v22, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v19 -; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27] +; SDAG-NEXT: v_mov_b32_e32 v27, v21 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v19, v[26:27] ; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 
v29, v16 ; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18] ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v11, v20 +; SDAG-NEXT: v_mov_b32_e32 v11, v21 ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11 ; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v19, v[11:12] ; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10 ; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc -; SDAG-NEXT: v_mov_b32_e32 v10, v19 +; SDAG-NEXT: v_mov_b32_e32 v10, v20 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc @@ -2876,7 +2877,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_urem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 @@ -2887,20 +2887,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v2 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v2 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v3 ; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] 
; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 ; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 ; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 ; GISEL-NEXT: v_min_u32_e32 v17, v24, v17 ; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: v_min_u32_e32 v19, v29, v19 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 @@ -2913,28 +2914,28 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v18 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v20, v23, v16 ; GISEL-NEXT: v_or_b32_e32 v21, v19, v17 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v21, v22, v23 -; GISEL-NEXT: v_and_b32_e32 v22, 1, v21 ; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: v_and_b32_e32 v21, 1, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 -; 
GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -2987,11 +2988,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v17, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v25, 0 ; GISEL-NEXT: v_mov_b32_e32 v19, s7 ; GISEL-NEXT: v_mov_b32_e32 v18, s6 @@ -2999,40 +3000,40 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v16, s4 ; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v29 ; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27 -; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: 
v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; GISEL-NEXT: v_or_b32_e32 v22, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v23, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v28, v26 -; GISEL-NEXT: v_or_b32_e32 v19, v38, v27 -; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_or_b32_e32 v18, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v19, v28, v39 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v29, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v30, v32 ; GISEL-NEXT: v_or_b32_e32 v17, v31, v33 ; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v27, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 ; GISEL-NEXT: v_and_b32_e32 v17, v16, v8 -; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 -; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 -; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc -; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc -; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc +; GISEL-NEXT: v_and_b32_e32 v26, v16, v9 +; GISEL-NEXT: v_and_b32_e32 v39, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v48, v16, v11 +; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v20, v38 ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: 
v_mov_b32_e32 v17, v25 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index c3c1540383ec63..a4425666765618 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -694,18 +694,14 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 -; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 -; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -721,61 +717,64 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 ; SI-NEXT: 
v_cndmask_b32_e32 v0, v0, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 3199b76d279fab..3c70883f09d2c1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3030,50 +3030,50 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v20 -; VI-NEXT: v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v19 +; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v20 -; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v19 +; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v20 -; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v19 +; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v20 -; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v3, v19 +; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v20 -; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v4, v4, v19 +; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v20 -; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v5, v19 +; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v20 -; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v19 +; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v20 -; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v7, v7, v19 +; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; 
VI-NEXT: v_or_b32_e32 v8, v8, v20 -; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v8, v8, v19 +; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v20 -; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v9, v9, v19 +; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v19 +; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v11, v11, v11 ; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v11, v11, v19 ; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v10, v10, v20 -; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v15, v15, v15 ; VI-NEXT: v_max_f16_e32 v14, v14, v14 ; VI-NEXT: v_max_f16_e32 v13, v13, v13 ; VI-NEXT: v_max_f16_e32 v12, v12, v12 -; VI-NEXT: v_max_f16_e32 v11, v11, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v20 ; VI-NEXT: v_or_b32_e32 v12, v12, v19 ; VI-NEXT: v_or_b32_e32 v13, v13, v18 ; VI-NEXT: v_or_b32_e32 v14, v14, v17 @@ -3342,11 +3342,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v64f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -3358,7 +3358,7 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -3370,341 +3370,344 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v23 
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v4, v5, v4 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; CI-NEXT: v_or_b32_e32 v5, v7, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; CI-NEXT: v_or_b32_e32 v7, v9, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: 
v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_or_b32_e32 v7, v8, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v18 ; CI-NEXT: v_or_b32_e32 v8, v10, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; CI-NEXT: v_or_b32_e32 v9, v11, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; 
CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_or_b32_e32 v9, v10, v9 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; CI-NEXT: v_or_b32_e32 v10, v12, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_or_b32_e32 v10, v14, v10 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v30 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_or_b32_e32 v11, v13, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v12, v15, v12 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v31 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v33 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; 
CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; CI-NEXT: v_or_b32_e32 v15, v25, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v21 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; CI-NEXT: v_or_b32_e32 v16, v24, v25 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; CI-NEXT: v_or_b32_e32 v25, v28, v24 -; CI-NEXT: s_waitcnt vmcnt(9) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v19, v20 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_or_b32_e32 v19, v20, v19 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 +; CI-NEXT: v_or_b32_e32 v20, v22, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v34 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v17, v17, v26 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 -; CI-NEXT: v_or_b32_e32 v18, v27, v18 -; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: s_waitcnt vmcnt(12) -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: 
v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: s_waitcnt vmcnt(13) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: s_waitcnt vmcnt(12) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v24 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; CI-NEXT: v_or_b32_e32 v20, v23, v20 -; CI-NEXT: s_waitcnt vmcnt(9) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v23, v27, v23 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: 
v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v17, v17, v18 -; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0 -; CI-NEXT: v_or_b32_e32 v25, v25, v26 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 ; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_or_b32_e32 v19, v24, v19 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: 
v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; CI-NEXT: v_or_b32_e32 v22, v23, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; CI-NEXT: v_or_b32_e32 v23, v28, v23 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v24, 
v24, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v27, v28, v27 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; CI-NEXT: v_or_b32_e32 v28, v29, v28 -; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen 
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; CI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; CI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; 
CI-NEXT: v_add_i32_e32 v32, vcc, 0x64, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; CI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:48 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, 
v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; CI-NEXT: buffer_store_dword v21, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen ; 
CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index a0fe9d88e31cf9..3a7f3e41002d28 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -172,52 +172,52 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; 
GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -331,34 +331,34 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; 
GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB0_9: ; %Flow3 @@ -540,52 +540,52 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 
16, v0 @@ -699,34 +699,34 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: 
v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB1_9: ; %Flow3 @@ -900,52 +900,52 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; 
GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v5 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; 
GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1054,34 +1054,34 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: 
v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB2_9: ; %Flow3 @@ -1255,52 +1255,52 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 
3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v5 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; 
GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1409,34 +1409,34 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, 
v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB3_9: ; %Flow3 @@ -1786,34 +1786,34 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: 
v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB6_9: ; %Flow3 @@ -2135,34 +2135,34 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; 
GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; 
GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB7_9: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 4f3086a9eb1f9a..34ee90c68569fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1209,50 +1209,50 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] +; SDAG-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] -; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7] +; SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] +; SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 ; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] ; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 -; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11] -; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5 -; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5 -; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5 -; SDAG-NEXT: v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5 +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13] +; SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[16:17], 
v[16:17], v[18:19], v[16:17] ; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] -; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] -; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] -; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] -; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] -; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] -; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] -; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13] -; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15] -; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17] -; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] -; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] -; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v13, 0x260 -; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15] -; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] -; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] -; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 -; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 -; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 -; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: 
v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17] +; SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 +; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 ; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] @@ -1266,61 +1266,61 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GISEL-NEXT: v_mov_b32_e32 v7, s5 ; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GISEL-NEXT: 
v_lshlrev_b32_e32 v6, 8, v6 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] -; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5 -; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] -; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5 -; GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9] -; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5 -; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11] -; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5 -; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5 -; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5 -; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] +; GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 +; GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9] +; GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5 +; GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11] +; GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5 +; GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5 +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5 +; GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5 ; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] -; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] -; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] -; GISEL-NEXT: v_fma_f64 
v[18:19], -v[6:7], v[6:7], v[0:1] -; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] -; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] -; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] -; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] -; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] -; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] -; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] -; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] -; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] -; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] -; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; GISEL-NEXT: v_mov_b32_e32 v13, 0x260 -; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc -; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] -; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 -; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 -; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 -; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 -; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] +; GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3] +; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9] +; GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] +; 
GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13] +; GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 +; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3b2f15c8340a63..78e521aba120e9 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -671,17 +671,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; CI-NEXT: 
s_waitcnt vmcnt(6) +; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -692,17 +692,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -713,19 +713,19 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1388,137 +1388,137 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-LABEL: void_func_v32i8: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v2, v3, v2 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; CI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; CI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; CI-NEXT: v_or_b32_e32 v5, v5, v6 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_or_b32_e32 v8, v8, v9 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; CI-NEXT: v_or_b32_e32 v12, v12, v13 -; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; CI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; CI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; CI-NEXT: v_and_b32_e32 v9, 0xff, v14 ; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; CI-NEXT: v_or_b32_e32 v7, v4, v5 +; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; CI-NEXT: v_or_b32_e32 v12, v12, v13 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v28 -; CI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; CI-NEXT: v_or_b32_e32 v8, v8, v9 -; CI-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; CI-NEXT: 
v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 ; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; CI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; CI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v30 -; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v27 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v29 +; CI-NEXT: v_and_b32_e32 v14, 0xff, v28 +; CI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; CI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v9 +; CI-NEXT: v_or_b32_e32 v9, v11, v10 +; CI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; CI-NEXT: v_or_b32_e32 v6, v7, v6 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27 +; CI-NEXT: v_and_b32_e32 v27, 0xff, v30 +; CI-NEXT: v_or_b32_e32 v13, v14, v13 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; CI-NEXT: v_or_b32_e32 v7, v3, v2 +; CI-NEXT: v_or_b32_e32 v3, v10, v1 +; CI-NEXT: v_or_b32_e32 v1, v4, v6 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; CI-NEXT: v_or_b32_e32 v11, v15, v14 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v9, v9, v13 -; CI-NEXT: v_or_b32_e32 v10, v11, v10 -; CI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; CI-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v5, v5, v6 -; CI-NEXT: v_or_b32_e32 v6, v0, v2 -; CI-NEXT: v_or_b32_e32 v9, v11, v9 -; CI-NEXT: v_or_b32_e32 v8, v8, v10 -; CI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 -; CI-NEXT: v_and_b32_e32 v11, 0xff, v24 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v7 +; CI-NEXT: v_or_b32_e32 v2, v8, v9 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; CI-NEXT: v_and_b32_e32 v9, 0xff, v16 ; CI-NEXT: 
s_mov_b32 s5, 0 ; CI-NEXT: s_mov_b32 s4, 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_or_b32_e32 v3, v1, v0 -; CI-NEXT: v_or_b32_e32 v0, v11, v10 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; CI-NEXT: v_or_b32_e32 v2, v0, v5 -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v23 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v20 -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; CI-NEXT: v_or_b32_e32 v1, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v17 -; CI-NEXT: v_and_b32_e32 v5, 0xff, v16 -; CI-NEXT: v_or_b32_e32 v4, v5, v4 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_or_b32_e32 v0, v4, v0 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; CI-NEXT: v_or_b32_e32 v5, v24, v25 +; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; CI-NEXT: v_or_b32_e32 v4, v4, v26 +; CI-NEXT: v_or_b32_e32 v6, v5, v11 +; CI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; CI-NEXT: v_or_b32_e32 v7, v12, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; CI-NEXT: v_or_b32_e32 v5, v8, v5 +; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; CI-NEXT: v_or_b32_e32 v5, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v19 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v4, v4, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_or_b32_e32 v4, v8, v4 +; CI-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s4, s5 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: void_func_v32i8: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX89-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v15 -; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 ; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX89-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX89-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX89-NEXT: v_lshlrev_b16_e32 v14, 8, v25 +; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v27 +; GFX89-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX89-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX89-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX89-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; GFX89-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v7 -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX89-NEXT: 
v_lshlrev_b16_e32 v9, 8, v9 -; GFX89-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 -; GFX89-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX89-NEXT: v_lshlrev_b16_e32 v2, 8, v23 -; GFX89-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v17 -; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v19 -; GFX89-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: s_mov_b32 s5, 0 ; GFX89-NEXT: s_mov_b32 s4, 16 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v5, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v4, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v6, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v5, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v14 +; GFX89-NEXT: v_lshlrev_b16_e32 
v8, 8, v10 ; GFX89-NEXT: v_or_b32_sdwa v8, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 @@ -2622,102 +2622,37 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt } define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { -; CI-LABEL: void_func_v32i32_i32_i64: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_i32_i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: 
s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_i32_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v32i32_i32_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(3) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: 
buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i32_i64: ; GFX11: ; %bb.0: @@ -2765,129 +2700,86 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v33 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; 
CI-NEXT: v_and_b32_e32 v0, 1, v17 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; CI-NEXT: v_and_b32_e32 v0, 1, v34 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: void_func_v32i32_i1_i8_i16_bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: 
s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 1, v20 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: void_func_v32i32_i1_i8_i16_bf16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX89-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:4 +; GFX89-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX89-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX89-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; GFX89-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:20 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt vmcnt(5) +; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v0, 1, v32 +; GFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v34, off, 
s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v35, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v36, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16: ; GFX11: ; %bb.0: @@ -2945,105 +2837,38 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg } define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v2i32_v2f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v2i32_v2f32: 
-; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i32_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v32i32_v2i32_v2f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(4) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, 
s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[34:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32: ; GFX11: ; %bb.0: @@ -3093,54 +2918,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; CI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32 +; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33 +; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34 +; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35 +; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 +; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v8, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; CI-NEXT: v_mul_f32_e32 v10, 1.0, v16 -; CI-NEXT: v_mul_f32_e32 v11, 1.0, v17 -; CI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; CI-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; CI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v17 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; CI-NEXT: buffer_store_short v14, off, s[4:7], 0 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; 
CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v5, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3156,82 +2981,43 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 
-; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 
0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt vmcnt(5) +; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v35, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v36, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; GFX11: ; %bb.0: @@ -3277,284 +3063,132 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 
x i32> %arg0, <2 x i ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef - store volatile <2 x half> %arg2, ptr addrspace(1) undef - store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef - store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef - ret void -} - -define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v2i64_v2f64: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: 
s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v2i64_v2f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i64_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: 
s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 
v39, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - store volatile <32 x i32> %arg0, ptr addrspace(1) undef - store volatile <2 x i64> %arg1, ptr addrspace(1) undef - store volatile <2 x double> %arg2, ptr addrspace(1) undef - ret void -} - -define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v4i32_v4f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 
-; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v4i32_v4f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v4i32_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] + store volatile <2 x half> %arg2, ptr addrspace(1) undef + store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef + store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef + ret void +} + +define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { +; CIGFX89-LABEL: void_func_v32i32_v2i64_v2f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; 
CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(8) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 
dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + store volatile <32 x i32> %arg0, ptr addrspace(1) undef + store volatile <2 x i64> %arg1, ptr addrspace(1) undef + store volatile <2 x double> %arg2, ptr addrspace(1) undef + ret void +} + +define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { +; CIGFX89-LABEL: void_func_v32i32_v4i32_v4f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CIGFX89-NEXT: 
s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(8) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v4i32_v4f32: ; GFX11: ; %bb.0: @@ -3608,7 +3242,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: 
buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3617,37 +3258,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:40 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,7 +3291,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3666,37 +3307,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 
; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3706,7 +3340,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3715,41 +3356,31 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword 
v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3817,9 +3448,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3828,61 +3466,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; 
CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3890,9 +3521,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3901,61 +3539,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,9 +3594,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3974,69 +3612,57 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, 
s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v12, 
off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4323,7 +3949,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; 
CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4332,61 +3965,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 ; 
CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v16, 
off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -4396,7 +4022,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4405,61 +4038,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; VI-NEXT: 
buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; 
VI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4469,7 +4095,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> 
%arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4478,65 +4111,56 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: 
buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 401cbce00ac9a8..ac9f56d1ee7b15 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1497,8 +1497,8 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; 
GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 @@ -1519,13 +1519,13 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -1780,8 +1780,8 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 @@ -1802,13 +1802,13 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: 
buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -2063,8 +2063,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 @@ -2085,13 +2085,13 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 
offen offset:196 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 @@ -2616,21 +2616,21 @@ define <32 x bfloat> @v32bf16_func_void() #0 { ; CI-NEXT: v_mov_b32_e32 v9, v1 ; CI-NEXT: v_mov_b32_e32 v10, v2 ; CI-NEXT: v_mov_b32_e32 v11, v3 -; CI-NEXT: v_mov_b32_e32 v12, v4 -; CI-NEXT: v_mov_b32_e32 v13, v5 -; CI-NEXT: v_mov_b32_e32 v14, v6 ; CI-NEXT: v_mov_b32_e32 v16, v0 ; CI-NEXT: v_mov_b32_e32 v17, v1 ; CI-NEXT: v_mov_b32_e32 v18, v2 ; CI-NEXT: v_mov_b32_e32 v19, v3 -; CI-NEXT: v_mov_b32_e32 v20, v4 -; CI-NEXT: v_mov_b32_e32 v21, v5 ; CI-NEXT: v_mov_b32_e32 v24, v0 ; CI-NEXT: v_mov_b32_e32 v25, v1 ; CI-NEXT: v_mov_b32_e32 v26, v2 ; CI-NEXT: v_mov_b32_e32 v27, v3 +; CI-NEXT: v_mov_b32_e32 v12, v4 +; CI-NEXT: v_mov_b32_e32 v20, v4 ; CI-NEXT: v_mov_b32_e32 v28, v4 +; CI-NEXT: v_mov_b32_e32 v13, v5 +; CI-NEXT: v_mov_b32_e32 v21, v5 ; CI-NEXT: v_mov_b32_e32 v29, v5 +; CI-NEXT: v_mov_b32_e32 v14, v6 ; CI-NEXT: v_mov_b32_e32 v22, v6 ; CI-NEXT: v_mov_b32_e32 v30, v6 ; CI-NEXT: v_mov_b32_e32 v15, v7 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 545a9af3f9a0bd..5ccbc85f46dd40 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5227,19 +5227,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v4, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 7d07641f455e3f..c3ab9c23d1950b 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2379,140 
+2379,128 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-LABEL: return_72xi32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GFX10-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:76 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 ; GFX10-NEXT: buffer_store_dword v25, v0, 
s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:24 ; GFX10-NEXT: 
buffer_store_dword v14, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 +; GFX10-NEXT: buffer_load_dword 
v10, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:268 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen 
offset:192 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: 
buffer_store_dword v23, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:156 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: 
buffer_store_dword v5, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: return_72xi32: diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index fbb54893d9b2ac..a2fca33af10464 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1253,57 +1253,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; 
CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; CI-NEXT: v_mov_b32_e32 v6, s2 -; CI-NEXT: s_add_u32 s2, s0, 48 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_mov_b32_e32 v21, s3 -; CI-NEXT: v_mov_b32_e32 v23, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_mov_b32_e32 v20, s2 -; CI-NEXT: v_mov_b32_e32 v22, s0 -; CI-NEXT: flat_store_dwordx4 
v[6:7], v[16:19] +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; CI-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -1312,26 +1312,24 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v23, s3 -; VI-NEXT: v_mov_b32_e32 v22, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v21, s1 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v20, s0 +; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v25, s3 -; VI-NEXT: v_mov_b32_e32 v27, s1 -; VI-NEXT: v_mov_b32_e32 v24, s2 -; VI-NEXT: v_mov_b32_e32 v26, s0 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v20, s2 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 @@ -1341,19 +1339,21 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; VI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[22:23], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: @@ -1665,43 +1665,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 +; 
CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v21, s3 -; CI-NEXT: v_mov_b32_e32 v23, s1 -; CI-NEXT: v_mov_b32_e32 v20, s2 -; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v16, 
s0 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: @@ -1713,39 +1713,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v8, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v23, s1 -; VI-NEXT: v_mov_b32_e32 v20, s2 -; VI-NEXT: v_mov_b32_e32 v22, s0 +; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: 
v_cvt_f64_f32_e32 v[12:13], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 -; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: @@ -1794,92 +1794,91 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: 
v_mov_b32_e32 v16, s3 -; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v18, s3 -; CI-NEXT: v_mov_b32_e32 v17, s2 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s1 -; CI-NEXT: v_mov_b32_e32 v11, s0 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; CI-NEXT: v_mov_b32_e32 v16, s3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: 
v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_mov_b32_e32 v15, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 -; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; CI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; CI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; 
CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v20, s3 +; CI-NEXT: v_mov_b32_e32 v19, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v19, s2 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1897,76 +1896,77 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 -; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v18, s3 +; VI-NEXT: v_mov_b32_e32 v17, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: 
v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 64 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v11, s1 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 -; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; VI-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 ; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 -; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 -; VI-NEXT: v_cvt_f64_f32_e32 
v[4:5], v29 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v23, s1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v27 -; VI-NEXT: v_mov_b32_e32 v20, s2 -; VI-NEXT: v_mov_b32_e32 v22, s0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: v_mov_b32_e32 v20, s3 +; VI-NEXT: v_mov_b32_e32 v14, s1 +; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: v_mov_b32_e32 v13, s0 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: @@ -2368,52 +2368,51 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v13, s3 -; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CI-NEXT: v_mov_b32_e32 v12, s2 +; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; CI-NEXT: 
s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 -; CI-NEXT: v_or_b32_e32 v2, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_or_b32_e32 v1, v10, v4 -; CI-NEXT: v_or_b32_e32 v0, v8, v5 +; CI-NEXT: v_or_b32_e32 v2, v17, v7 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_or_b32_e32 v3, v14, v6 -; CI-NEXT: v_or_b32_e32 v2, v12, v7 +; CI-NEXT: v_or_b32_e32 v1, v10, v6 +; CI-NEXT: v_or_b32_e32 v0, v8, v7 +; CI-NEXT: v_or_b32_e32 v3, v14, v9 +; CI-NEXT: v_or_b32_e32 v2, v12, v11 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -2429,31 +2428,29 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: s_add_u32 s4, s2, 48 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: 
v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -2464,17 +2461,19 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v18 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 ; VI-NEXT: 
v_or_b32_e32 v3, v6, v7 -; VI-NEXT: v_or_b32_e32 v2, v4, v5 +; VI-NEXT: v_or_b32_e32 v2, v18, v17 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_or_b32_e32 v0, v8, v9 ; VI-NEXT: v_or_b32_e32 v3, v14, v15 ; VI-NEXT: v_or_b32_e32 v2, v12, v13 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index add62a5c39cb14..b9d3763e7def10 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2678,7 +2678,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 +; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 +; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 @@ -2686,8 +2687,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 +; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: 
v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index e71c6cf71c8823..74020c43a3ca3f 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1111,16 +1111,13 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-LABEL: extract_neg_offset_sgpr_loaded: ; GENERIC: ; %bb.0: ; %entry ; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 ; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 -; GENERIC-NEXT: s_or_b32 s4, s23, s51 -; GENERIC-NEXT: s_or_b32 s5, s22, s50 -; GENERIC-NEXT: s_or_b32 s6, s21, s49 -; GENERIC-NEXT: s_or_b32 s7, s20, s48 +; GENERIC-NEXT: s_or_b32 s6, s23, s51 +; GENERIC-NEXT: s_or_b32 s7, s22, s50 +; GENERIC-NEXT: s_or_b32 s21, s21, s49 +; GENERIC-NEXT: s_or_b32 s20, s20, s48 ; GENERIC-NEXT: s_or_b32 s19, s19, s47 ; GENERIC-NEXT: s_or_b32 s18, s18, s46 ; GENERIC-NEXT: s_or_b32 s17, s17, s45 @@ -1133,38 +1130,42 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-NEXT: s_or_b32 s10, s10, s38 ; GENERIC-NEXT: s_or_b32 s8, s8, s36 ; GENERIC-NEXT: s_or_b32 s9, s9, s37 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 -; GENERIC-NEXT: s_cselect_b32 s8, s9, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s9, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 2 -; GENERIC-NEXT: s_cselect_b32 s8, s10, s8 
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 3 -; GENERIC-NEXT: s_cselect_b32 s8, s11, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 4 -; GENERIC-NEXT: s_cselect_b32 s8, s12, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 -; GENERIC-NEXT: s_cselect_b32 s8, s13, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 6 -; GENERIC-NEXT: s_cselect_b32 s8, s14, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 7 -; GENERIC-NEXT: s_cselect_b32 s8, s15, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 8 -; GENERIC-NEXT: s_cselect_b32 s8, s16, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 9 -; GENERIC-NEXT: s_cselect_b32 s8, s17, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 10 -; GENERIC-NEXT: s_cselect_b32 s8, s18, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 11 -; GENERIC-NEXT: s_cselect_b32 s8, s19, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s19, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 12 -; GENERIC-NEXT: s_cselect_b32 s7, s7, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s20, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 13 -; GENERIC-NEXT: s_cselect_b32 s6, s6, s7 +; GENERIC-NEXT: s_cselect_b32 s4, s21, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 14 -; GENERIC-NEXT: s_cselect_b32 s5, s5, s6 +; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 15 -; GENERIC-NEXT: s_cselect_b32 s4, s4, s5 +; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; @@ -1278,9 +1279,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: 
s_or_b32 s8, s8, s36 -; SI-MOVREL-NEXT: s_or_b32 s5, s23, s51 -; SI-MOVREL-NEXT: s_or_b32 s6, s22, s50 -; SI-MOVREL-NEXT: s_or_b32 s7, s21, s49 +; SI-MOVREL-NEXT: s_or_b32 s6, s23, s51 +; SI-MOVREL-NEXT: s_or_b32 s7, s22, s50 +; SI-MOVREL-NEXT: s_or_b32 s21, s21, s49 ; SI-MOVREL-NEXT: s_or_b32 s20, s20, s48 ; SI-MOVREL-NEXT: s_or_b32 s19, s19, s47 ; SI-MOVREL-NEXT: s_or_b32 s18, s18, s46 @@ -1307,9 +1308,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 ; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s6 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm @@ -5699,94 +5700,94 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 ; GENERIC-NEXT: s_mov_b32 s27, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: v_mov_b32 v1, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v3, s20 -; GENERIC-NEXT: v_mov_b32_e32 v4, s21 -; GENERIC-NEXT: v_mov_b32_e32 v5, s22 -; GENERIC-NEXT: v_mov_b32_e32 v6, s23 -; GENERIC-NEXT: v_mov_b32_e32 v7, s16 -; GENERIC-NEXT: v_mov_b32_e32 v8, s17 -; GENERIC-NEXT: v_mov_b32_e32 v9, s18 -; GENERIC-NEXT: v_mov_b32_e32 v10, s19 -; GENERIC-NEXT: v_mov_b32_e32 v11, s12 -; GENERIC-NEXT: v_mov_b32_e32 v12, s13 -; GENERIC-NEXT: v_mov_b32_e32 v13, s14 -; 
GENERIC-NEXT: v_mov_b32_e32 v14, s15 -; GENERIC-NEXT: v_mov_b32_e32 v15, s8 -; GENERIC-NEXT: v_mov_b32_e32 v16, s9 -; GENERIC-NEXT: v_mov_b32_e32 v17, s10 -; GENERIC-NEXT: v_mov_b32_e32 v18, s11 -; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; GENERIC-NEXT: v_mov_b32_e32 v10, s22 +; 
GENERIC-NEXT: v_mov_b32_e32 v11, s23 +; GENERIC-NEXT: v_mov_b32_e32 v15, s16 +; GENERIC-NEXT: v_mov_b32_e32 v2, s18 +; GENERIC-NEXT: v_mov_b32_e32 v3, s19 +; GENERIC-NEXT: v_mov_b32_e32 v4, s12 +; GENERIC-NEXT: v_mov_b32_e32 v5, s13 +; GENERIC-NEXT: v_mov_b32_e32 v6, s14 +; GENERIC-NEXT: v_mov_b32_e32 v7, s15 +; GENERIC-NEXT: v_mov_b32_e32 v8, s8 +; GENERIC-NEXT: v_mov_b32_e32 v9, s9 +; GENERIC-NEXT: v_mov_b32_e32 v12, s10 +; GENERIC-NEXT: v_mov_b32_e32 v13, s11 +; GENERIC-NEXT: v_add_i32_e32 v18, vcc, 1, v14 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; GENERIC-NEXT: 
v_cndmask_b32_e32 v9, 63, v9, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; GENERIC-NEXT: v_mov_b32_e32 v16, s17 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; 
GENERIC-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; GENERIC-NEXT: v_mov_b32_e32 v15, s21 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; GENERIC-NEXT: v_mov_b32_e32 v19, s20 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) @@ -6257,97 +6258,98 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: v_mov_b32 v1, 62 ; SI-MOVREL-NEXT: ;;#ASMEND ; SI-MOVREL-NEXT: 
s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s21 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s22 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s23 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s11 -; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; 
SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s23 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s16 +; SI-MOVREL-NEXT: v_add_i32_e32 v18, vcc, 1, v14 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; 
SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s17 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s21 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s20 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; 
SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 @@ -6368,104 +6370,104 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: flat_load_dword v14, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: v_mov_b32 v1, 62 ; VI-NEXT: ;;#ASMEND +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 1, v14 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; VI-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; VI-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; VI-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; 
VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; VI-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; VI-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; VI-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; VI-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; VI-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; VI-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; VI-NEXT: 
v_cndmask_b32_e32 v16, 63, v19, vcc +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s20 -; VI-NEXT: v_mov_b32_e32 v4, s21 -; VI-NEXT: v_mov_b32_e32 v5, s22 -; VI-NEXT: v_mov_b32_e32 v6, s23 -; VI-NEXT: v_mov_b32_e32 v7, s16 -; VI-NEXT: v_mov_b32_e32 v8, s17 -; VI-NEXT: v_mov_b32_e32 v9, s18 -; VI-NEXT: v_mov_b32_e32 v10, s19 -; VI-NEXT: v_mov_b32_e32 v11, s12 -; VI-NEXT: v_mov_b32_e32 v12, s13 -; VI-NEXT: v_mov_b32_e32 v13, s14 -; VI-NEXT: v_mov_b32_e32 v14, s15 -; VI-NEXT: v_mov_b32_e32 v15, s8 -; VI-NEXT: v_mov_b32_e32 v16, s9 -; VI-NEXT: v_mov_b32_e32 v17, s10 -; VI-NEXT: v_mov_b32_e32 v18, s11 ; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; VI-NEXT: 
v_cmp_eq_u32_e32 vcc, 7, v2 -; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 -; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 -; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; VI-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: 
v_cndmask_b32_e32 v14, 63, v20, vcc +; VI-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6496,105 +6498,105 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GFX9-IDXMODE: ; %bb.0: ; %entry ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: global_load_dword v14, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: ;;#ASMSTART ; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 ; GFX9-IDXMODE-NEXT: ;;#ASMEND -; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3 -; 
GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 -; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 
v9, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v18, 1, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; 
GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; 
GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:48 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] offset:32 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, 
v[6:9], s[0:1] offset:16 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 @@ -6629,132 +6631,134 @@ bb2: define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { ; GENERIC-LABEL: insert_w_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 -; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000 -; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000 +; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s24, s[4:5], 0xb +; GENERIC-NEXT: s_mov_b32 s31, 0xf000 +; GENERIC-NEXT: s_mov_b32 s30, -1 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000 ; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000 ; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: 
s_add_i32 s5, s4, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s5, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 14 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 15 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 9 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 10 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 5 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 1 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 -; GENERIC-NEXT: s_cselect_b64 vcc, 
-1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 3 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc -; GENERIC-NEXT: s_add_i32 s4, s4, 2 -; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 +; GENERIC-NEXT: s_add_i32 s25, s24, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 13 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 14 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 15 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 8 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 9 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 10 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 4 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 5 +; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 6 +; GENERIC-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 7 +; GENERIC-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 1 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, 2.0, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 2 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 3 +; GENERIC-NEXT: s_cselect_b64 s[22:23], 
-1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, 4.0, v0, s[22:23] +; GENERIC-NEXT: s_add_i32 s26, s24, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 3 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 2 +; GENERIC-NEXT: s_cselect_b64 s[24:25], -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 1 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 15 -; GENERIC-NEXT: s_cselect_b64 
vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[22:23] +; GENERIC-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[24:25] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 1 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, v0, v14, s[22:23] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, v0, v13, s[22:23] +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[14:15] +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[16:17] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[18:19] +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[20:21] +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 7 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 6 +; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v0, v12, s[14:15] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v0, v11, s[16:17] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 5 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[14:15] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 4 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[14:15] +; GENERIC-NEXT: 
v_cndmask_b32_e32 v1, v1, v0, vcc +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[2:3] +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[6:7] +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[10:11] +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[12:13] +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 11 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(1) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: 
insert_w_offset_multiple_in_block: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 48a168b4bfbe71..d5b6c19399a1f8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1314,108 +1314,108 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[0:1], s[14:15] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s5, 6 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s7, 6 +; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_cmp_eq_u32 s7, 7 +; SI-NEXT: s_mov_b64 s[14:15], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 4 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 5 +; 
SI-NEXT: s_cmp_eq_u32 s7, 4 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 2 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 2 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 0 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 0 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v13, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 1 +; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3] +; SI-NEXT: s_cmp_eq_u32 s7, 1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 14 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v14, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 14 +; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: 
v_cndmask_b32_e32 v12, v14, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 15 +; SI-NEXT: s_cmp_eq_u32 s7, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 12 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v15, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 12 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v15, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 13 +; SI-NEXT: s_cmp_eq_u32 s7, 13 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 10 -; SI-NEXT: v_or_b32_e32 v3, v3, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 10 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 11 +; SI-NEXT: s_cmp_eq_u32 s7, 11 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: s_cmp_eq_u32 s7, 9 +; 
SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1429,81 +1429,81 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s5, 14 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v12, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 12 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 13 
+; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 10 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 11 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 8 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 9 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, 
v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -1542,16 +1542,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 
; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 14 -; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 15 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1559,30 +1557,32 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 12 -; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc +; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 13 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 10 -; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc +; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 11 -; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 8 -; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 9 -; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX900-NEXT: s_cselect_b64 
vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2 ; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index d09af8fd2ac954..12b4b2b372ef8e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2794,16 +2794,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 14 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2811,30 +2809,32 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 12 -; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc +; GFX9-NEXT: v_perm_b32 v1, v12, v1, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: 
s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 10 -; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc +; GFX9-NEXT: v_perm_b32 v8, v12, v8, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 11 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 8 -; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_perm_b32 v2, v11, v2, s2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 @@ -2844,7 +2844,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2858,81 +2858,81 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s5, 14 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, 
vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v12, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 12 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 13 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 10 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 11 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 8 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 9 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; 
VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: 
v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -2965,101 
+2965,101 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; CI-NEXT: v_or_b32_e32 v9, v9, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v8, v8, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 ; CI-NEXT: s_cmp_eq_u32 s5, 9 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 
v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_or_b32_e32 v10, v10, v11 +; 
CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 +; CI-NEXT: v_or_b32_e32 v3, v3, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_or_b32_e32 v2, v2, v12 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; CI-NEXT: v_or_b32_e32 v8, v8, v13 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v13 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll 
index 26a4ea9d8a4b6e..edf900a50cd4b4 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5413,33 +5413,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc +; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] @@ -5518,33 +5518,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], 
s[4:5], v18, v4, v[14:15] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] @@ -5615,33 +5615,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; 
GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v8, v13, vcc +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, 
vcc, 1, v10 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index dd478f94e1039e..98552de05c8572 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -45,6 +45,10 @@ body: | ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec ; 
GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) @@ -64,14 +68,10 @@ body: | ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_14]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_15]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_16]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_13]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_14]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_15]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_16]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_17:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY17]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_17]], implicit $exec ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 
1d0367db701436..4532571d5cf2a1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -2059,207 +2059,207 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7-LABEL: v_maximum_v16f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v16 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v23 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v16 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v24 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v18 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v16 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v25 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v17 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v16 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v26 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v17 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v28 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v16 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v17 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v28 -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v18 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v29 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v17 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v22 +; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v18, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v17 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v23 +; GFX7-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX7-NEXT: v_max_f32_e32 v18, v13, v0 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v15 +; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v30 +; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v17 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v24 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v13 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_mov_b32_e32 v19, 0x7fc00000 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 
v19, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v13, v19, v16, s[26:27] +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v17 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v25 +; GFX7-NEXT: v_max_f32_e32 v16, v14, v15 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[8:9] +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v17 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v26 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v19, v7, s[14:15] +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v19, v12, s[24:25] +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v17 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 +; GFX7-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[20:21] +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v17 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v12, v20 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX7-NEXT: v_max_f32_e32 v20, v13, v19 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc -; GFX7-NEXT: v_max_f32_e32 v19, v14, v18 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] 
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[22:23] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v19, v18, s[28:29] +; GFX7-NEXT: v_max_f32_e32 v15, v20, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v20, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: v_max_f16_e32 v18, v17, v16 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_max_f16_e32 v16, v18, v17 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX8-NEXT: v_max_f16_e32 v21, v20, v18 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 
v19, v21, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX8-NEXT: v_max_f16_e32 v22, v21, v20 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX8-NEXT: v_max_f16_e32 v23, v22, v21 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX8-NEXT: v_max_f16_e32 v24, v23, v22 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v23, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v25, v24, v23 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v26, v25, v24 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc -; GFX8-NEXT: v_max_f16_e32 v25, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc -; GFX8-NEXT: v_max_f16_e32 v15, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc -; GFX8-NEXT: v_max_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc -; GFX8-NEXT: v_max_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc -; GFX8-NEXT: v_max_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc -; GFX8-NEXT: v_max_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 +; GFX8-NEXT: 
v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_max_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_max_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_max_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_max_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_max_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_max_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 +; GFX8-NEXT: v_max_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_max_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_max_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_max_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc -; GFX8-NEXT: v_max_f16_e32 v10, v1, v9 +; GFX8-NEXT: v_max_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc -; GFX8-NEXT: v_max_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_max_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index df7355c2c57bfa..584dd2700c419a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -1730,20 +1730,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX7-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1752,39 +1752,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX7-NEXT: 
v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; 
GFX7-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1797,20 +1797,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX8-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1819,39 +1819,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; 
GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, 
s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1864,20 +1864,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_max_f32_e32 v3, v3, 
v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX900-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1886,39 +1886,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: 
v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f8c2c54af27830..0b9cb9682ea5f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1598,87 +1598,87 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_minimum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: v_min_f16_e32 v18, v17, v16 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_min_f16_e32 v16, v18, v17 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX8-NEXT: v_min_f16_e32 v21, v20, v18 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX8-NEXT: v_min_f16_e32 v22, v21, v20 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX8-NEXT: v_min_f16_e32 v23, v22, v21 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX8-NEXT: v_min_f16_e32 v24, v23, v22 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v23, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v25, v24, v23 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0 
-; GFX8-NEXT: v_min_f16_e32 v26, v25, v24 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc -; GFX8-NEXT: v_min_f16_e32 v25, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc -; GFX8-NEXT: v_min_f16_e32 v15, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc -; GFX8-NEXT: v_min_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc -; GFX8-NEXT: v_min_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc -; GFX8-NEXT: v_min_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc -; GFX8-NEXT: v_min_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_min_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_min_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_min_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_min_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_min_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_min_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 
s[16:17], v6, v14 +; GFX8-NEXT: v_min_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_min_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_min_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_min_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_min_f16_e32 v3, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc -; GFX8-NEXT: v_min_f16_e32 v10, v1, v9 +; GFX8-NEXT: v_min_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc -; GFX8-NEXT: v_min_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_min_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: 
v_lshlrev_b32_e32 v8, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 956de6de3aad3b..99624331340730 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -1730,20 +1730,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX7-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: 
v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1752,39 +1752,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; 
GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1797,20 +1797,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: 
v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX8-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1819,39 +1819,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, 
v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1864,20 +1864,20 @@ define <16 x float> 
@v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX900-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1886,39 +1886,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, 
v26 ; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX900-NEXT: 
v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index c735854a455905..b378d69fb842ff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -574,84 +574,85 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_brev_b32 s6, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7] +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: s_cselect_b32 s4, 
0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec +; CI-NEXT: v_mov_b32_e32 v2, s7 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v5, s6, v2, v5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v10, s9 ; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] ; CI-NEXT: v_mov_b32_e32 v12, s15 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v12 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: 
v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s19 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v16 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_add_f64 v[10:11], s[22:23], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s17 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 ; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v20, s23 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_mov_b32_e32 v5, s2 +; CI-NEXT: v_mov_b32_e32 
v18, s23 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v18 +; CI-NEXT: v_mov_b32_e32 v18, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v19, s21 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v18, s21 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_bfi_b32 v5, s6, v18, v19 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index c1ab63b8160c6a..223870950e4b78 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1772,42 +1772,42 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v12, v[0:1] +; GFX8-NEXT: flat_load_ushort v18, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; 
GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v3, v12, 3, 1 -; GFX8-NEXT: v_bfe_i32 v2, v12, 2, 1 -; GFX8-NEXT: v_bfe_i32 v1, v12, 1, 1 -; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v12, 7, 1 -; GFX8-NEXT: v_bfe_i32 v6, v12, 6, 1 -; GFX8-NEXT: v_bfe_i32 v5, v12, 5, 1 -; GFX8-NEXT: v_bfe_i32 v4, v12, 4, 1 -; GFX8-NEXT: v_bfe_i32 v11, v12, 11, 1 -; GFX8-NEXT: v_bfe_i32 v10, v12, 10, 1 -; GFX8-NEXT: v_bfe_i32 v9, v12, 9, 1 -; GFX8-NEXT: v_bfe_i32 v8, v12, 8, 1 -; GFX8-NEXT: v_bfe_i32 v15, v12, 15, 1 -; GFX8-NEXT: v_bfe_i32 v14, v12, 14, 1 -; GFX8-NEXT: v_bfe_i32 v13, v12, 13, 1 -; GFX8-NEXT: v_bfe_i32 v12, v12, 12, 1 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: v_bfe_i32 v7, v18, 15, 1 +; GFX8-NEXT: v_bfe_i32 v6, v18, 14, 1 +; GFX8-NEXT: v_bfe_i32 v5, v18, 13, 1 +; GFX8-NEXT: v_bfe_i32 v4, v18, 12, 1 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: v_bfe_i32 v11, v18, 11, 1 +; GFX8-NEXT: v_bfe_i32 v10, v18, 10, 1 +; GFX8-NEXT: v_bfe_i32 v9, v18, 9, 1 +; GFX8-NEXT: v_bfe_i32 v8, v18, 8, 1 +; GFX8-NEXT: v_bfe_i32 v3, v18, 3, 1 +; GFX8-NEXT: v_bfe_i32 v2, v18, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v18, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v18, 0, 1 +; GFX8-NEXT: v_bfe_i32 v7, v18, 7, 1 +; GFX8-NEXT: v_bfe_i32 v6, v18, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v18, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v18, 4, 1 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i32: @@ -2707,33 +2707,33 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b ; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009 ; 
GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10019 -; GFX6-NEXT: s_lshr_b32 s18, s2, 31 -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x1001d -; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s21, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s24, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s27, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s28, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10019 -; GFX6-NEXT: s_lshr_b32 s35, s3, 31 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1001d -; GFX6-NEXT: s_and_b32 s37, s2, 1 -; GFX6-NEXT: s_bfe_u32 s38, s2, 0x10002 +; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000d +; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10013 +; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10011 +; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10017 +; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10015 +; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001b +; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10019 +; GFX6-NEXT: s_lshr_b32 s20, s2, 31 +; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1001d +; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10001 +; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s28, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s29, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10015 +; 
GFX6-NEXT: s_bfe_u32 s35, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10019 +; GFX6-NEXT: s_lshr_b32 s37, s3, 31 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x1001d +; GFX6-NEXT: s_and_b32 s12, s2, 1 +; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10002 ; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006 ; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004 ; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a @@ -2752,91 +2752,90 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 ; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006 ; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x1000a -; GFX6-NEXT: s_bfe_u32 s58, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000e +; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 +; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000e +; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c ; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012 ; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 ; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000c +; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 +; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a +; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 +; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e +; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000a ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v1, s36 -; GFX6-NEXT: v_mov_b32_e32 v2, s65 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s64 -; GFX6-NEXT: v_mov_b32_e32 v5, s34 -; GFX6-NEXT: v_mov_b32_e32 v6, s63 -; GFX6-NEXT: v_mov_b32_e32 v7, s33 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 -; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v1, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s66 +; GFX6-NEXT: v_mov_b32_e32 v3, 
s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s65 +; GFX6-NEXT: v_mov_b32_e32 v5, s36 +; GFX6-NEXT: v_mov_b32_e32 v6, s64 +; GFX6-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NEXT: v_mov_b32_e32 v8, s63 +; GFX6-NEXT: v_mov_b32_e32 v9, s34 ; GFX6-NEXT: v_mov_b32_e32 v10, s62 -; GFX6-NEXT: v_mov_b32_e32 v11, s30 +; GFX6-NEXT: v_mov_b32_e32 v11, s33 ; GFX6-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 ; GFX6-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NEXT: v_mov_b32_e32 v15, s30 +; GFX6-NEXT: v_mov_b32_e32 v16, s59 +; GFX6-NEXT: v_mov_b32_e32 v17, s29 +; GFX6-NEXT: v_mov_b32_e32 v18, s58 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s57 +; GFX6-NEXT: v_mov_b32_e32 v19, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s68 +; GFX6-NEXT: v_mov_b32_e32 v3, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NEXT: v_mov_b32_e32 v2, s59 -; GFX6-NEXT: v_mov_b32_e32 v3, s26 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s57 -; GFX6-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NEXT: v_mov_b32_e32 v2, s55 -; GFX6-NEXT: v_mov_b32_e32 v3, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v1, s23 ; GFX6-NEXT: v_mov_b32_e32 v2, s54 -; GFX6-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s51 -; GFX6-NEXT: v_mov_b32_e32 v3, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s47 -; GFX6-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s45 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -2853,9 +2852,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: 
v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3446,59 +3445,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_i32 s46, s3, 0x1000a ; GFX6-NEXT: s_bfe_i32 s47, s3, 0x10009 ; GFX6-NEXT: s_bfe_i32 s48, s3, 0x10008 -; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000f -; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000e -; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000d -; GFX6-NEXT: s_bfe_i32 s52, s3, 0x1000c +; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000e +; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000d +; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000c +; GFX6-NEXT: s_bfe_i32 s52, s3, 0x10013 ; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012 ; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011 ; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010 ; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017 ; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016 ; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015 -; GFX6-NEXT: s_bfe_i32 s59, s3, 0x1001b -; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001a -; GFX6-NEXT: s_bfe_i32 s61, s3, 0x10019 -; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10018 -; GFX6-NEXT: s_ashr_i32 s63, s3, 31 -; GFX6-NEXT: s_bfe_i32 s64, s3, 0x1001e -; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001d -; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_i32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_i32 s68, s3, 0x10013 +; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10014 +; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b +; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a +; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019 +; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018 +; GFX6-NEXT: s_ashr_i32 s64, s3, 31 +; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e +; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d +; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_i32 
s68, s3, 0x1000f ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v1, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s64 -; GFX6-NEXT: v_mov_b32_e32 v3, s63 -; GFX6-NEXT: v_mov_b32_e32 v4, s62 -; GFX6-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NEXT: v_mov_b32_e32 v6, s60 -; GFX6-NEXT: v_mov_b32_e32 v7, s59 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v1, s66 +; GFX6-NEXT: v_mov_b32_e32 v2, s65 +; GFX6-NEXT: v_mov_b32_e32 v3, s64 +; GFX6-NEXT: v_mov_b32_e32 v4, s63 +; GFX6-NEXT: v_mov_b32_e32 v5, s62 +; GFX6-NEXT: v_mov_b32_e32 v6, s61 +; GFX6-NEXT: v_mov_b32_e32 v7, s60 +; GFX6-NEXT: v_mov_b32_e32 v8, s59 ; GFX6-NEXT: v_mov_b32_e32 v9, s58 ; GFX6-NEXT: v_mov_b32_e32 v10, s57 ; GFX6-NEXT: v_mov_b32_e32 v11, s56 ; GFX6-NEXT: v_mov_b32_e32 v12, s55 ; GFX6-NEXT: v_mov_b32_e32 v13, s54 ; GFX6-NEXT: v_mov_b32_e32 v14, s53 +; GFX6-NEXT: v_mov_b32_e32 v15, s52 +; GFX6-NEXT: v_mov_b32_e32 v16, s51 +; GFX6-NEXT: v_mov_b32_e32 v17, s50 +; GFX6-NEXT: v_mov_b32_e32 v18, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s68 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NEXT: v_mov_b32_e32 v3, s49 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v19, s68 ; GFX6-NEXT: v_mov_b32_e32 v1, s47 ; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 
offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 @@ -5099,40 +5097,40 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: v_mov_b32_e32 v15, s4 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 -; GFX8-NEXT: v_mov_b32_e32 v13, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, v1 +; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0 ; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 ; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GFX8-NEXT: v_bfe_u32 v10, v0, 3, 1 -; GFX8-NEXT: v_bfe_u32 v14, v0, 1, 1 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 -; GFX8-NEXT: v_bfe_u32 v8, v0, 2, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v24 -; GFX8-NEXT: v_bfe_u32 v0, v24, 6, 1 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GFX8-NEXT: 
flat_store_dwordx4 v[18:19], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v16, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s1 +; GFX8-NEXT: v_mov_b32_e32 v17, s0 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 +; GFX8-NEXT: v_mov_b32_e32 v15, s2 +; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v6 +; GFX8-NEXT: v_bfe_u32 v0, v6, 6, 1 +; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[11:14] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i1_to_v8i64: @@ -5728,61 +5726,63 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 13, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 13, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v12, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v13, 9, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 6, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 7, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 11, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v1 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v16, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 1, v1 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX6-NEXT: v_bfe_i32 v14, v13, 0, 1 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX6-NEXT: v_bfe_i32 v17, v5, 0, 1 -; GFX6-NEXT: v_bfe_i32 v15, v3, 0, 1 +; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 1 +; GFX6-NEXT: v_bfe_i32 v4, v9, 0, 1 +; GFX6-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_bfe_i32 v9, v12, 0, 1 +; GFX6-NEXT: v_bfe_i32 v7, v11, 0, 1 +; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 1 +; GFX6-NEXT: v_bfe_i32 v11, v1, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 7, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_bfe_i32 v17, v1, 0, 1 +; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 1 ; GFX6-NEXT: v_bfe_i32 v21, v16, 0, 1 -; GFX6-NEXT: v_bfe_i32 v19, v1, 0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 -; GFX6-NEXT: v_bfe_i32 v25, v1, 0, 1 -; GFX6-NEXT: v_bfe_i32 v23, v11, 0, 1 -; GFX6-NEXT: v_bfe_i32 v29, v9, 0, 1 -; GFX6-NEXT: v_bfe_i32 v27, v7, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, 
v21 +; GFX6-NEXT: v_bfe_i32 v19, v14, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX6-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX6-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX6-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GFX6-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GFX6-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: @@ -5792,8 +5792,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v27, s1 -; GFX8-NEXT: v_mov_b32_e32 v26, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v18, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_lshr_b32 s2, s3, 14 @@ -5831,70 +5831,70 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s3 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s13 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v14, s14 -; GFX8-NEXT: v_mov_b32_e32 v15, s15 -; GFX8-NEXT: v_mov_b32_e32 v16, s16 -; GFX8-NEXT: 
v_mov_b32_e32 v17, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v18, s18 -; GFX8-NEXT: v_mov_b32_e32 v19, s19 -; GFX8-NEXT: v_mov_b32_e32 v20, s20 -; GFX8-NEXT: v_mov_b32_e32 v21, s21 +; GFX8-NEXT: v_mov_b32_e32 v7, s19 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v9, s21 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v22, s22 -; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: v_mov_b32_e32 v24, s24 -; GFX8-NEXT: v_mov_b32_e32 v25, s25 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v10, s22 +; GFX8-NEXT: v_mov_b32_e32 v11, s23 +; GFX8-NEXT: v_mov_b32_e32 v12, s24 +; GFX8-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 -; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: v_mov_b32_e32 v6, s28 -; GFX8-NEXT: v_mov_b32_e32 v7, s29 -; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v14, s26 +; GFX8-NEXT: v_mov_b32_e32 v15, s27 +; GFX8-NEXT: v_mov_b32_e32 v16, s28 +; GFX8-NEXT: v_mov_b32_e32 v17, s29 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[14:17] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i64: @@ -6607,164 +6607,164 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s52, s8, 30 -; GFX6-NEXT: s_lshr_b32 s46, s8, 31 -; GFX6-NEXT: s_lshr_b32 s48, s8, 28 -; GFX6-NEXT: s_lshr_b32 s36, s8, 29 -; GFX6-NEXT: s_lshr_b32 s38, s8, 26 -; GFX6-NEXT: s_lshr_b32 s26, s8, 27 -; GFX6-NEXT: s_lshr_b32 s28, s8, 24 -; GFX6-NEXT: s_lshr_b32 s4, s8, 25 -; GFX6-NEXT: s_lshr_b32 s6, s8, 22 -; GFX6-NEXT: s_lshr_b32 s10, s8, 23 -; GFX6-NEXT: s_lshr_b32 s12, s8, 20 -; GFX6-NEXT: s_lshr_b32 s14, s8, 21 -; GFX6-NEXT: s_lshr_b32 s16, s8, 18 -; GFX6-NEXT: s_lshr_b32 s18, s8, 19 -; GFX6-NEXT: s_lshr_b32 s20, s8, 16 -; GFX6-NEXT: s_lshr_b32 s22, s8, 17 -; GFX6-NEXT: s_lshr_b32 s24, s8, 14 -; GFX6-NEXT: s_lshr_b32 s30, s8, 15 -; GFX6-NEXT: s_lshr_b32 s34, s8, 12 -; GFX6-NEXT: s_lshr_b32 s40, s8, 13 -; GFX6-NEXT: s_lshr_b32 s42, s8, 10 -; GFX6-NEXT: s_lshr_b32 s44, s8, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s8, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s8, 9 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; 
GFX6-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NEXT: s_lshr_b32 s46, s8, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s8, 7 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_lshr_b32 s38, s4, 30 +; GFX6-NEXT: s_lshr_b32 s40, s4, 31 +; GFX6-NEXT: s_lshr_b32 s34, s4, 28 +; GFX6-NEXT: s_lshr_b32 s36, s4, 29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 26 +; GFX6-NEXT: s_lshr_b32 s30, s4, 27 +; GFX6-NEXT: s_lshr_b32 s24, s4, 24 +; GFX6-NEXT: s_lshr_b32 s26, s4, 25 +; GFX6-NEXT: s_lshr_b32 s20, s4, 22 +; GFX6-NEXT: s_lshr_b32 s22, s4, 23 +; GFX6-NEXT: s_lshr_b32 s18, s4, 20 +; GFX6-NEXT: s_lshr_b32 s6, s4, 21 +; GFX6-NEXT: s_lshr_b32 s8, s4, 18 +; GFX6-NEXT: s_lshr_b32 s10, s4, 19 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: s_lshr_b32 s14, s4, 17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 14 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 12 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 11 ; GFX6-NEXT: v_mov_b32_e32 v8, s36 ; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s8, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s8, 5 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NEXT: 
v_mov_b32_e32 v13, s27 -; GFX6-NEXT: s_lshr_b32 s26, s8, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NEXT: s_lshr_b32 s28, s8, 3 -; GFX6-NEXT: s_lshr_b32 s8, s8, 1 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_lshr_b32 s36, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s24 +; GFX6-NEXT: v_mov_b32_e32 v15, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v16, s26 +; GFX6-NEXT: v_mov_b32_e32 v17, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 4 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, s18 +; GFX6-NEXT: v_mov_b32_e32 v7, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s4 -; GFX6-NEXT: v_mov_b32_e32 v17, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_mov_b32_e32 v9, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(1) 
+; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_mov_b32_e32 v4, s14 ; GFX6-NEXT: v_mov_b32_e32 v5, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s24 -; GFX6-NEXT: v_mov_b32_e32 v3, s25 -; GFX6-NEXT: v_mov_b32_e32 v4, s30 -; GFX6-NEXT: v_mov_b32_e32 v5, s31 +; GFX6-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NEXT: v_mov_b32_e32 v5, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 -; GFX6-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 
+; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NEXT: v_mov_b32_e32 v5, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NEXT: v_mov_b32_e32 v4, s48 -; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v4, s20 +; GFX6-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: 
s_endpgm ; @@ -7332,21 +7332,21 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b ; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d ; GFX6-NEXT: s_lshr_b32 s34, s2, 31 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s50, s3, 31 +; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s39, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s42, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10019 +; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001d +; GFX6-NEXT: s_lshr_b32 s49, s3, 31 ; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 ; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 ; GFX6-NEXT: s_and_b32 s7, s2, 1 @@ -7362,7 +7362,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 ; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 ; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018 +; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 ; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a ; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c ; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e @@ 
-7386,63 +7386,63 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v2, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v2, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 @@ -7457,7 +7457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8347,478 +8347,477 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
s_lshr_b32 s48, s5, 30 -; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s44, s5, 29 -; GFX6-NEXT: s_lshr_b32 s40, s5, 26 -; GFX6-NEXT: s_lshr_b32 s42, s5, 27 -; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s38, s5, 25 -; GFX6-NEXT: s_lshr_b32 s30, s5, 22 -; GFX6-NEXT: s_lshr_b32 s34, s5, 23 -; GFX6-NEXT: s_lshr_b32 s26, s5, 20 -; GFX6-NEXT: s_lshr_b32 s28, s5, 21 -; GFX6-NEXT: s_lshr_b32 s22, s5, 18 -; GFX6-NEXT: s_lshr_b32 s24, s5, 19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NEXT: s_lshr_b32 s20, s5, 17 -; GFX6-NEXT: s_lshr_b32 s14, s5, 14 -; GFX6-NEXT: s_lshr_b32 s16, s5, 15 -; GFX6-NEXT: s_lshr_b32 s10, s5, 12 -; GFX6-NEXT: s_lshr_b32 s12, s5, 13 -; GFX6-NEXT: s_lshr_b32 s6, s5, 10 -; GFX6-NEXT: s_lshr_b32 s8, s5, 11 -; GFX6-NEXT: s_mov_b32 s50, s5 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s5, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 -; GFX6-NEXT: s_lshr_b32 s52, s5, 9 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s46, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s54 -; GFX6-NEXT: v_mov_b32_e32 v11, s55 -; GFX6-NEXT: s_lshr_b32 s48, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s5, 30 +; GFX6-NEXT: s_lshr_b32 s36, s5, 28 +; GFX6-NEXT: s_lshr_b32 s38, s5, 29 +; GFX6-NEXT: s_lshr_b32 s30, s5, 26 +; GFX6-NEXT: s_lshr_b32 s34, s5, 27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 24 +; GFX6-NEXT: s_lshr_b32 s28, s5, 25 +; GFX6-NEXT: s_lshr_b32 s22, s5, 22 +; GFX6-NEXT: s_lshr_b32 s24, s5, 23 +; GFX6-NEXT: s_lshr_b32 s18, s5, 20 +; GFX6-NEXT: s_lshr_b32 s20, s5, 21 +; GFX6-NEXT: s_lshr_b32 s14, s5, 18 +; GFX6-NEXT: s_lshr_b32 s16, s5, 19 +; GFX6-NEXT: s_lshr_b32 s10, s5, 
16 +; GFX6-NEXT: s_lshr_b32 s12, s5, 17 +; GFX6-NEXT: s_lshr_b32 s6, s5, 14 +; GFX6-NEXT: s_lshr_b32 s8, s5, 15 +; GFX6-NEXT: s_mov_b32 s40, s5 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NEXT: v_mov_b32_e32 v13, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s54 -; GFX6-NEXT: v_mov_b32_e32 v17, s55 -; GFX6-NEXT: s_lshr_b32 s40, s5, 2 -; GFX6-NEXT: v_mov_b32_e32 v8, s7 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s7 +; GFX6-NEXT: s_lshr_b32 s40, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NEXT: v_mov_b32_e32 v7, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 13 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 10 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NEXT: v_mov_b32_e32 v7, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s38 +; GFX6-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 8 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 
0 offset:480 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s30 -; GFX6-NEXT: v_mov_b32_e32 v11, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v12, s34 -; GFX6-NEXT: v_mov_b32_e32 v13, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 31 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 6 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v16, s28 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 29 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: s_lshr_b32 s28, s5, 4 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s22 +; GFX6-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NEXT: s_lshr_b32 s22, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v10, s24 +; GFX6-NEXT: v_mov_b32_e32 v11, s25 +; GFX6-NEXT: s_lshr_b32 s24, s5, 2 
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s18, s4, 25 +; GFX6-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NEXT: v_mov_b32_e32 v13, s19 +; GFX6-NEXT: s_lshr_b32 s18, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:416 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 22 -; GFX6-NEXT: v_mov_b32_e32 v16, s16 -; GFX6-NEXT: v_mov_b32_e32 v17, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 23 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v4, s16 +; GFX6-NEXT: v_mov_b32_e32 v5, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 31 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 20 -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 21 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 
+; GFX6-NEXT: s_lshr_b32 s10, s4, 28 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 29 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:384 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NEXT: s_lshr_b32 s46, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s44, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 22 ; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s6, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_lshr_b32 s42, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_waitcnt 
expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 21 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s28 +; GFX6-NEXT: v_mov_b32_e32 v9, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s50 -; GFX6-NEXT: v_mov_b32_e32 v15, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s52 -; GFX6-NEXT: v_mov_b32_e32 v17, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 14 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s18 +; GFX6-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, 
s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s16 +; GFX6-NEXT: v_mov_b32_e32 v19, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s46 -; GFX6-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v8, s48 -; GFX6-NEXT: v_mov_b32_e32 v9, s49 -; GFX6-NEXT: s_lshr_b32 s48, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: 
s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v16, s36 ; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_lshr_b32 s36, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v18, s42 +; GFX6-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 2 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:288 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v12, s28 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NEXT: 
s_lshr_b32 s28, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:272 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt 
expcnt(2) +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 -; GFX6-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; 
GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NEXT: v_mov_b32_e32 v9, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s90, -1 -; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; GFX8-NEXT: 
s_add_u32 s88, s88, s11 -; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 +; GFX8-NEXT: s_lshr_b32 s48, s3, 15 ; GFX8-NEXT: v_writelane_b32 v62, s0, 0 -; GFX8-NEXT: v_writelane_b32 v62, s1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: s_lshr_b32 s36, s3, 21 -; GFX8-NEXT: s_lshr_b32 s30, s3, 19 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 ; GFX8-NEXT: s_lshr_b32 s74, s3, 30 -; GFX8-NEXT: s_lshr_b32 s50, s3, 31 +; GFX8-NEXT: s_lshr_b32 s30, s3, 31 ; GFX8-NEXT: s_lshr_b32 s72, s3, 28 -; GFX8-NEXT: s_lshr_b32 s48, s3, 29 +; GFX8-NEXT: s_lshr_b32 s34, s3, 29 ; GFX8-NEXT: s_lshr_b32 s70, s3, 26 -; GFX8-NEXT: s_lshr_b32 s46, s3, 27 +; GFX8-NEXT: s_lshr_b32 s36, s3, 27 ; GFX8-NEXT: s_lshr_b32 s68, s3, 24 -; GFX8-NEXT: s_lshr_b32 s42, s3, 25 -; GFX8-NEXT: s_lshr_b32 s66, s3, 22 +; GFX8-NEXT: s_lshr_b32 s38, s3, 25 +; GFX8-NEXT: s_lshr_b32 s64, s3, 22 ; GFX8-NEXT: s_lshr_b32 s40, s3, 23 -; GFX8-NEXT: s_lshr_b32 s64, s3, 20 -; GFX8-NEXT: s_lshr_b32 s62, s3, 18 +; GFX8-NEXT: s_lshr_b32 s60, s3, 20 +; GFX8-NEXT: s_lshr_b32 s42, s3, 21 +; GFX8-NEXT: s_lshr_b32 s66, s3, 18 +; GFX8-NEXT: s_lshr_b32 s44, s3, 19 ; GFX8-NEXT: s_lshr_b32 s56, s3, 16 -; GFX8-NEXT: s_lshr_b32 s18, s3, 17 +; GFX8-NEXT: s_lshr_b32 s46, s3, 17 ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 -; GFX8-NEXT: s_lshr_b32 s38, s3, 15 -; GFX8-NEXT: s_lshr_b32 s60, s3, 12 -; GFX8-NEXT: s_lshr_b32 s44, s3, 13 +; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: v_writelane_b32 v62, s1, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 9 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: 
s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v18, s36 -; GFX8-NEXT: v_mov_b32_e32 v19, s37 -; GFX8-NEXT: v_mov_b32_e32 v26, s30 -; GFX8-NEXT: v_mov_b32_e32 v27, s31 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v34, s48 +; GFX8-NEXT: s_lshr_b32 s48, s2, 1 +; GFX8-NEXT: s_lshr_b32 s50, s3, 13 ; GFX8-NEXT: v_writelane_b32 v62, s1, 3 -; GFX8-NEXT: s_lshr_b32 s6, s3, 9 -; GFX8-NEXT: s_lshr_b32 s8, s3, 6 +; GFX8-NEXT: s_lshr_b32 s6, s3, 6 ; GFX8-NEXT: s_lshr_b32 s10, s3, 
7 ; GFX8-NEXT: s_lshr_b32 s12, s3, 4 ; GFX8-NEXT: s_lshr_b32 s14, s3, 5 ; GFX8-NEXT: s_lshr_b32 s16, s3, 2 -; GFX8-NEXT: s_lshr_b32 s20, s3, 3 -; GFX8-NEXT: s_lshr_b32 s22, s3, 1 -; GFX8-NEXT: s_mov_b32 s24, s3 -; GFX8-NEXT: s_lshr_b32 s26, s2, 30 -; GFX8-NEXT: s_lshr_b32 s28, s2, 31 -; GFX8-NEXT: s_lshr_b32 s34, s2, 28 +; GFX8-NEXT: s_lshr_b32 s18, s3, 3 +; GFX8-NEXT: s_lshr_b32 s20, s3, 1 +; GFX8-NEXT: s_mov_b32 s22, s3 +; GFX8-NEXT: s_lshr_b32 s24, s2, 30 +; GFX8-NEXT: s_lshr_b32 s26, s2, 31 +; GFX8-NEXT: s_lshr_b32 s28, s2, 28 ; GFX8-NEXT: v_mov_b32_e32 v4, s74 -; GFX8-NEXT: v_mov_b32_e32 v8, s72 +; GFX8-NEXT: v_mov_b32_e32 v12, s72 ; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v54, s68 -; GFX8-NEXT: v_mov_b32_e32 v20, s66 +; GFX8-NEXT: v_mov_b32_e32 v8, s68 ; GFX8-NEXT: v_mov_b32_e32 v16, s64 -; GFX8-NEXT: v_mov_b32_e32 v24, s62 +; GFX8-NEXT: v_mov_b32_e32 v20, s60 +; GFX8-NEXT: v_mov_b32_e32 v24, s66 ; GFX8-NEXT: v_mov_b32_e32 v28, s56 ; GFX8-NEXT: v_mov_b32_e32 v32, s58 -; GFX8-NEXT: v_mov_b32_e32 v36, s60 +; GFX8-NEXT: v_mov_b32_e32 v36, s62 ; GFX8-NEXT: s_lshr_b32 s86, s2, 29 ; GFX8-NEXT: v_mov_b32_e32 v40, s54 ; GFX8-NEXT: s_lshr_b32 s84, s2, 26 ; GFX8-NEXT: s_lshr_b32 s82, s2, 27 +; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 ; GFX8-NEXT: s_lshr_b32 s80, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s50 +; GFX8-NEXT: v_mov_b32_e32 v6, s30 +; GFX8-NEXT: v_mov_b32_e32 v7, s31 ; GFX8-NEXT: s_lshr_b32 s78, s2, 25 ; GFX8-NEXT: s_lshr_b32 s76, s2, 22 -; GFX8-NEXT: v_mov_b32_e32 v10, s48 +; GFX8-NEXT: v_mov_b32_e32 v14, s34 ; GFX8-NEXT: s_lshr_b32 s74, s2, 23 ; GFX8-NEXT: s_lshr_b32 s72, s2, 20 -; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v2, s36 ; GFX8-NEXT: s_lshr_b32 s70, s2, 21 ; GFX8-NEXT: s_lshr_b32 s68, s2, 18 -; GFX8-NEXT: v_mov_b32_e32 v56, s42 +; GFX8-NEXT: v_mov_b32_e32 v10, s38 ; GFX8-NEXT: s_lshr_b32 s66, s2, 19 ; GFX8-NEXT: s_lshr_b32 s64, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v22, s40 +; GFX8-NEXT: v_mov_b32_e32 v18, s40 
; GFX8-NEXT: s_lshr_b32 s62, s2, 17 ; GFX8-NEXT: s_lshr_b32 s60, s2, 14 +; GFX8-NEXT: v_mov_b32_e32 v22, s42 ; GFX8-NEXT: s_lshr_b32 s58, s2, 15 ; GFX8-NEXT: s_lshr_b32 s56, s2, 12 +; GFX8-NEXT: v_mov_b32_e32 v26, s44 ; GFX8-NEXT: s_lshr_b32 s54, s2, 13 -; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s2, 10 -; GFX8-NEXT: v_mov_b32_e32 v30, s18 -; GFX8-NEXT: v_mov_b32_e32 v31, s19 -; GFX8-NEXT: s_lshr_b32 s50, s2, 11 -; GFX8-NEXT: s_lshr_b32 s48, s2, 8 -; GFX8-NEXT: v_mov_b32_e32 v34, s36 +; GFX8-NEXT: v_mov_b32_e32 v30, s46 +; GFX8-NEXT: s_lshr_b32 s4, s2, 11 +; GFX8-NEXT: s_lshr_b32 s0, s2, 8 ; GFX8-NEXT: s_lshr_b32 s46, s2, 9 ; GFX8-NEXT: s_lshr_b32 s44, s2, 6 -; GFX8-NEXT: v_mov_b32_e32 v38, s30 ; GFX8-NEXT: s_lshr_b32 s42, s2, 7 ; GFX8-NEXT: s_lshr_b32 s40, s2, 4 ; GFX8-NEXT: s_lshr_b32 s38, s2, 5 ; GFX8-NEXT: s_lshr_b32 s36, s2, 2 -; GFX8-NEXT: s_lshr_b32 s30, s2, 3 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 +; GFX8-NEXT: s_lshr_b32 s34, s2, 3 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 +; GFX8-NEXT: v_writelane_b32 v62, s2, 4 +; GFX8-NEXT: v_writelane_b32 v62, s3, 5 +; GFX8-NEXT: v_readlane_b32 s2, v62, 2 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: v_readlane_b32 s3, v62, 3 +; GFX8-NEXT: v_mov_b32_e32 v38, s50 +; GFX8-NEXT: v_mov_b32_e32 v39, s51 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 -; GFX8-NEXT: v_mov_b32_e32 v7, s51 -; GFX8-NEXT: v_mov_b32_e32 v9, s73 -; GFX8-NEXT: v_mov_b32_e32 v11, s49 +; GFX8-NEXT: v_mov_b32_e32 v13, s73 +; GFX8-NEXT: v_mov_b32_e32 v15, s35 ; GFX8-NEXT: v_mov_b32_e32 v1, s71 -; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: v_mov_b32_e32 v55, s69 -; GFX8-NEXT: v_mov_b32_e32 v57, s43 -; GFX8-NEXT: 
v_mov_b32_e32 v21, s67 -; GFX8-NEXT: v_mov_b32_e32 v23, s41 +; GFX8-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NEXT: v_mov_b32_e32 v9, s69 +; GFX8-NEXT: v_mov_b32_e32 v11, s39 ; GFX8-NEXT: v_mov_b32_e32 v17, s65 -; GFX8-NEXT: v_mov_b32_e32 v25, s63 +; GFX8-NEXT: v_mov_b32_e32 v19, s41 +; GFX8-NEXT: v_mov_b32_e32 v21, s61 +; GFX8-NEXT: v_mov_b32_e32 v23, s43 +; GFX8-NEXT: v_mov_b32_e32 v25, s67 +; GFX8-NEXT: v_mov_b32_e32 v27, s45 ; GFX8-NEXT: v_mov_b32_e32 v29, s57 +; GFX8-NEXT: v_mov_b32_e32 v31, s47 ; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v35, s37 -; GFX8-NEXT: v_mov_b32_e32 v37, s61 -; GFX8-NEXT: v_mov_b32_e32 v39, s31 +; GFX8-NEXT: v_mov_b32_e32 v35, s49 +; GFX8-NEXT: v_mov_b32_e32 v37, s63 ; GFX8-NEXT: v_mov_b32_e32 v41, s55 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -8837,269 +8836,262 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; 
GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1f0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; 
GFX8-NEXT: s_add_u32 s2, s4, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[54:57] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27] -; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_add_u32 s2, s4, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x190 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v55, s3 +; GFX8-NEXT: v_mov_b32_e32 v54, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x180 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v57, s3 +; GFX8-NEXT: v_mov_b32_e32 v56, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x170 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x160 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 -; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x130 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x150 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: 
flat_store_dwordx4 v[44:45], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x140 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x120 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x120 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx4 
v[18:19], v[28:31] +; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] ; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35] ; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39] -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[40:43] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[40:43] +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x100 +; GFX8-NEXT: s_add_u32 s0, s8, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xf0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NEXT: v_mov_b32_e32 v3, s23 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xe0 -; GFX8-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NEXT: v_mov_b32_e32 v1, s27 -; GFX8-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NEXT: v_mov_b32_e32 v3, s29 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; 
GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v0, s34 -; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xc0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xb0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xa0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x90 +; GFX8-NEXT: s_add_u32 s0, s8, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x80 +; GFX8-NEXT: s_add_u32 s0, s8, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x70 +; GFX8-NEXT: s_add_u32 s0, s8, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x60 +; GFX8-NEXT: s_add_u32 s0, s8, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x50 +; GFX8-NEXT: s_add_u32 s0, s8, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 64 +; GFX8-NEXT: s_add_u32 s0, s8, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 48 +; GFX8-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 32 +; GFX8-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 16 +; GFX8-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v62, 2 +; GFX8-NEXT: v_readlane_b32 s0, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v62, 3 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_readlane_b32 s1, v62, 5 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, 
s30 +; GFX8-NEXT: v_mov_b32_e32 v1, s31 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index bb98af4e7a5c7f..255a1acbe0086f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -637,8 +637,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: flat_load_ushort v19, v[6:7] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[8:9] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[12:13] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[14:15] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -664,18 +664,18 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[0:1] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[2:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v5, v[6:7] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-VI-NEXT: 
flat_load_ushort v8, v[8:9] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[2:3] ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v17, v1 @@ -688,25 +688,25 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v21, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v23, v6 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v13, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v24, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v15, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v11, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v10, v0 ; 
GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2502,29 +2502,27 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -2534,56 +2532,60 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, 
s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2622,32 +2624,32 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s1, s14, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 
0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 @@ -2981,88 +2983,90 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s4, 16 +; 
GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s10, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: @@ -3073,8 +3077,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 ; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 
s24, s5, 16 @@ -3087,34 +3089,36 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 16 ; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s0, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s1, s14, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -3524,18 +3528,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -3555,22 +3559,21 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -3652,10 +3655,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s44, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff @@ -3664,13 +3667,13 @@ define amdgpu_kernel void 
@constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s42, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s51, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s50, s15, 0xffff ; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3708,111 +3711,111 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, 
s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 
v3, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: 
s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3854,57 +3857,34 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s24, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s27, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s26, 16 
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s29, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s28, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s30, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s18, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s21, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s20, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s23, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s22, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s25, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s24, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s28, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16 
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff @@ -3919,151 +3899,170 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: 
s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s36, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s36, 32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s36, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 
s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4437,16 +4436,17 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s6, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s6, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s8, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 @@ -4455,8 +4455,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -4474,24 +4473,23 @@ define amdgpu_kernel void 
@constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 @@ -4586,10 +4584,10 @@ define amdgpu_kernel void 
@constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10 ; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s46, s12, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s47, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s48, s12 -; GCN-HSA-NEXT: s_ashr_i32 s49, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s47, s12, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s46, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s49, s12 +; GCN-HSA-NEXT: s_ashr_i32 s48, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 ; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14 @@ -4597,8 +4595,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_ashr_i32 s55, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s53, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16 @@ -4611,114 +4609,114 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16 ; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s54, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_sext_i32_i16 s53, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s56, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 +; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[21:22], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, 
s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -7033,104 +7031,102 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s16, s15 -; GCN-HSA-NEXT: s_mov_b32 s18, s13 -; GCN-HSA-NEXT: s_mov_b32 s20, s11 -; GCN-HSA-NEXT: s_mov_b32 s22, s9 -; GCN-HSA-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 
16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 +; GCN-HSA-NEXT: s_mov_b32 s6, s19 +; GCN-HSA-NEXT: s_mov_b32 s10, s17 +; GCN-HSA-NEXT: s_mov_b32 s20, s15 +; GCN-HSA-NEXT: s_mov_b32 s22, s13 +; GCN-HSA-NEXT: s_lshr_b32 s24, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[18:19], s[18:19], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[16:17], s[16:17], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 
s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s28, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s29, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; 
GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -7403,106 +7399,108 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 +; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, 
s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: @@ -7513,141 +7511,142 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 
s22, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 ; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s15, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 
v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -8091,144 +8090,140 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], 
s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[48:49], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[58:59], s[2:3], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: 
s_ashr_i64 s[2:3], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[54:55], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], 
s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s50 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v17, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8237,13 +8232,13 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s42, s15 +; GCN-HSA-NEXT: s_mov_b32 s40, s15 ; GCN-HSA-NEXT: s_mov_b32 s48, s13 ; GCN-HSA-NEXT: s_mov_b32 s50, s11 ; GCN-HSA-NEXT: s_mov_b32 s52, s9 ; GCN-HSA-NEXT: s_mov_b32 s54, s7 ; GCN-HSA-NEXT: s_mov_b32 s56, s5 -; GCN-HSA-NEXT: s_mov_b32 s46, s3 +; GCN-HSA-NEXT: s_mov_b32 s44, s3 ; GCN-HSA-NEXT: s_mov_b32 s58, s1 ; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 ; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 @@ -8258,15 +8253,15 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; 
GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -8282,8 +8277,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 @@ -8299,84 +8294,82 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 ; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v30, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 
v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -8441,208 +8434,211 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s1, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x100000 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 +; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s68, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s14, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s76, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s15, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s13, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s15, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v5, s15 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[54:55], 
0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[74:75], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[76:77], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[78:79], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x90 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v5, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 
s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: 
s_addc_u32 s1, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6eeaec12c3d148..341332e60b5c0d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -2713,37 +2713,39 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96 +; 
GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: @@ -2752,97 +2754,91 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s20, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s22, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s23, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s25, s6, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s14, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s20, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s21, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s22, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 0x70 +; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: 
s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12] ; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -3500,137 +3496,135 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s38, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s36, s16 -; GFX6-NOHSA-NEXT: s_mov_b32 s37, s17 -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x10 +; GFX6-NOHSA-NEXT: s_mov_b32 s36, s0 +; GFX6-NOHSA-NEXT: s_mov_b32 s37, s1 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s0, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s2, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s4, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s6, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s45, s17, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s46, s16, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s19, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s48, s18, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s21, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s20, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s51, s23, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s30, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s31, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s28, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s29, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s52 -; GFX6-NOHSA-NEXT: 
s_ashr_i32 s52, s26, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s27, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s22, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s17, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s16, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s19, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s18, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s21, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s20, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s30, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s31, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s28, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s29, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s23, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s44 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s22, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s27, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s26, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s24, 31 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v10, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s17 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s24, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s12, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s15, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s14, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s2, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s4, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s6, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s9, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s13, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s12, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s22 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s15, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s14, 31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s51 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s33 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s30 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; 
GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -3646,45 +3640,45 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s6, 31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s37, s8, 31 +; 
GFX7-HSA-NEXT: s_ashr_i32 s38, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s39, s10, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s14, 31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s38, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s39, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s42, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s30, s5, 31 
+; GFX7-HSA-NEXT: s_ashr_i32 s31, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s44, s6, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s45, s9, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s8, 31 @@ -3694,105 +3688,101 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s50, s12, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s15, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s52, s14, 31 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xd0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s28 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s24 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x90 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x80 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[28:31] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 +; 
GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 +; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 +; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 +; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: 
s_add_u32 s6, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -3801,8 +3791,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -3811,15 +3801,15 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -4193,43 +4183,37 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: 
s_ashr_i32 s65, s31, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s66, s30, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s63, s29, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s64, s28, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s30, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s31, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s28, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s29, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s26, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s27, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s24, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v11, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s25, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s66 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s65 -; GFX9-HSA-NEXT: s_ashr_i32 s61, s27, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s62, s26, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 -; GFX9-HSA-NEXT: s_ashr_i32 s59, s25, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s64 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s63 -; GFX9-HSA-NEXT: s_ashr_i32 s60, s24, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:224 ; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s62 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s61 +; GFX9-HSA-NEXT: v_mov_b32_e32 v13, s58 ; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:208 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 +; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s29 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-HSA-NEXT: 
v_mov_b32_e32 v2, s60 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s59 ; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:192 -; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[4:7], s[36:37] offset:224 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57 +; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176 ; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31 @@ -4294,14 +4278,18 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39 ; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s26 +; GFX9-HSA-NEXT: v_mov_b32_e32 v9, s27 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32 -; GFX9-HSA-NEXT: s_nop 0 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[7:10], s[36:37] offset:208 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35 +; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-HSA-NEXT: v_mov_b32_e32 v12, s25 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16 -; GFX9-HSA-NEXT: s_nop 0 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[10:13], s[36:37] offset:192 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1 @@ -4496,64 +4484,64 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 
v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s36, 0xe0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: s_add_u32 s26, s36, 0xd0 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GFX7-HSA-NEXT: s_addc_u32 s27, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s36, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s36, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: s_addc_u32 s19, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0x80 @@ -4562,7 +4550,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, 
s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: s_add_u32 s14, s36, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 @@ -5111,53 +5099,52 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s35 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: s_add_u32 s16, s36, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s36, 
48 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 @@ -5165,9 +5152,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s36, 32 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -5175,20 +5162,20 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s36, 16 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: s_addc_u32 s5, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 102c33ec31b09d..b3e75e767ae641 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -638,53 +638,52 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX7-NEXT: s_add_u32 s34, s36, 0x70 -; GFX7-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-NEXT: v_mov_b32_e32 v6, s35 +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-NEXT: s_add_u32 s24, s36, 0x60 -; GFX7-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX7-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-NEXT: v_mov_b32_e32 v6, s26 -; GFX7-NEXT: v_mov_b32_e32 v7, s27 -; GFX7-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-NEXT: v_mov_b32_e32 v8, s4 +; GFX7-NEXT: v_mov_b32_e32 v9, s5 +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-NEXT: v_mov_b32_e32 v13, s1 +; GFX7-NEXT: v_mov_b32_e32 v14, s2 +; GFX7-NEXT: v_mov_b32_e32 v15, s3 +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX7-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-NEXT: v_mov_b32_e32 
v17, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 0x50 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: s_add_u32 s20, s36, 0x50 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: s_add_u32 s16, s36, 64 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: s_addc_u32 s17, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 64 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: s_add_u32 s12, s36, 48 +; GFX7-NEXT: s_add_u32 s12, s16, 48 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: s_addc_u32 s13, s37, 0 +; GFX7-NEXT: s_addc_u32 s13, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 @@ -692,9 +691,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_add_u32 s8, s36, 32 +; GFX7-NEXT: s_add_u32 s8, s16, 
32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_addc_u32 s9, s37, 0 +; GFX7-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 @@ -702,20 +701,20 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_add_u32 s4, s36, 16 +; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_addc_u32 s5, s37, 0 +; GFX7-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: v_mov_b32_e32 v4, s36 +; GFX7-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index ff55ab8859c833..efc31fbd5ed9ee 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2391,48 +2391,48 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s23, s9, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s24, s10, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s25, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s2, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s3, s11, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s26, s4, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s26, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s27, s11, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s28, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s29, s5, 
0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s30, s6, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s31, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff +; GFX7-HSA-NEXT: s_and_b32 s33, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s9, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s35, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s11, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 @@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 @@ -2450,7 +2450,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 @@ -2459,21 +2459,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 @@ -2880,33 +2880,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s30, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s31, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s33, s10, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s2, s11, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s3, s11, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s34, s11, 0x80008 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s11, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s35, s11, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 
v1, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 @@ -3281,32 +3281,32 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24 
-; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s13, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s14, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s14, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s51, s15, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s15, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s26, s0, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 @@ -3327,92 +3327,91 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_and_b32 s64, s9, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s65, s10, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s66, s11, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s67, s12, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s68, s13, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 
s69, s14, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s70, s15, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s11 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v1, s43 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -3429,7 +3428,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 @@ -3455,25 +3454,25 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 ; 
GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s37, s6, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s43, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s49, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s50, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s51, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s52, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s53, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s54, s13, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s55, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s56, s14, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s57, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s58, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff @@ -3482,18 +3481,18 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 
s37, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s42, s6, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s45, s7, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s59, s8, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s60, s8, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s61, s9, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s62, s9, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s63, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s64, s11, 0xff @@ -3506,97 +3505,97 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s53 +; GFX7-HSA-NEXT: 
s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 @@ -3610,7 +3609,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 @@ -4235,16 +4234,17 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i32 s54, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s55, s10, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX6-NOHSA-NEXT: s_ashr_i32 s56, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s58, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX6-NOHSA-NEXT: s_bfe_i32 s59, s12, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s61, s12 +; GFX6-NOHSA-NEXT: s_bfe_i32 s56, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s58, s11 +; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s61, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s12, s12 ; 
GFX6-NOHSA-NEXT: s_ashr_i32 s62, s13, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s63, s13, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s64, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX6-NOHSA-NEXT: s_ashr_i32 s65, s14, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s66, s14, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s67, s14, 0x80008 @@ -4253,8 +4253,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i32 s69, s15, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s70, s15, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s11, 24 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -4271,25 +4270,24 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s56 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s57 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v3, s56 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 @@ -4380,7 +4378,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 +; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 +; GFX7-HSA-NEXT: s_ashr_i32 s6, s7, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 @@ -4411,104 +4410,103 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 ; GFX7-HSA-NEXT: s_addc_u32 
s9, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s49 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v1, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 @@ -6819,80 +6817,82 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s24, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 +; 
GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v14, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v17, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -6901,26 +6901,30 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s26, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s24, s5 +; GFX7-HSA-NEXT: 
s_lshr_b32 s26, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -6929,31 +6933,27 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6961,15 +6961,15 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 @@ -6977,10 +6977,10 @@ define amdgpu_kernel 
void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -7390,143 +7390,144 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s7, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s20, s11, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s21, s10, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s22, s9, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s23, s8, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s24, s7, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s6, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s5, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff -; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff -; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff -; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff -; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff -; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff -; GFX7-HSA-NEXT: s_and_b32 
s34, s11, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s35, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s5, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s12, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s21, s13, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s14, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s23, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s24, s15, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s25, s14, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s26, s13, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s27, s12, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s28, s11, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s29, s10, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s4, s9, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s2, s8, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s3, s8, 0xff +; GFX7-HSA-NEXT: s_and_b32 s5, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s30, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s31, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s33, s12, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s13, 0xff +; GFX7-HSA-NEXT: s_and_b32 s35, s14, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s4, s11, 0x80010 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; 
GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7976,74 +7977,85 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8 -; GFX6-NOHSA-NEXT: 
s_lshr_b32 s38, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], 
s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 
s[46:47], s[62:63], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 @@ -8052,81 +8064,65 @@ define 
amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v16, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v0, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8136,33 +8132,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s50, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s52, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8 ; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; 
GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 ; GFX7-HSA-NEXT: s_mov_b32 s16, s1 ; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 @@ -8177,118 +8173,118 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], 
s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 -; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, 
s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v22, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s19 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 @@ -8308,8 +8304,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 64f1f45bf734cf..4217384cdd5ce7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -8733,4 +8733,4 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ret void ; } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 8f6a1f8c01ec34..5ce8a2b5f862e1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -4645,4 +4645,4 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 018da7f81e3d4b..9f264de531950b 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -139,16 +139,16 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -248,14 +248,14 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, 
implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -356,15 +356,15 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit 
[[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -464,27 +464,27 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; 
GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -600,29 +600,29 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -722,6 +722,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit 
$exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -742,8 +743,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -759,8 +758,8 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, 
implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -773,7 +772,8 @@ body: | ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: @@ -1114,14 +1114,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1194,12 +1186,19 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: 
[[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 @@ -1216,13 +1215,14 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc @@ -1643,10 +1643,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1719,6 +1715,10 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 @@ -2049,10 +2049,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; 
GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -2125,9 +2121,13 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 ; GFX908-NEXT: 
[[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 @@ -2801,6 +2801,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -2822,7 +2823,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -2988,7 +2988,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -3004,9 +3003,10 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -4974,20 +4974,20 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 
23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -4998,9 +4998,9 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 @@ -5192,13 +5192,13 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -5297,7 +5297,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -5305,6 +5304,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
bb.2: @@ -5726,17 +5726,17 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: DBG_VALUE [[V_CVT_I32_F64_e32_23]], 0, 0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: DBG_VALUE %23, 0, 0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -5836,17 +5836,17 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 851c9bb02a3456..127656f7aa626c 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -589,6 +589,6 @@ declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly declare void @llvm.memcpy.p0.p3.i64(ptr noalias 
nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2 -attributes #0 = { minsize } -attributes #1 = { optsize } +attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" } +attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir index f7e295a91c8281..4b0226a0f6586b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -263,10 +263,10 @@ body: | # GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec # GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec # GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec -# GCN-NEXT: KILL %0{{$}} # GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec # GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec # GCN-NEXT: KILL %1{{$}} +# GCN-NEXT: KILL %0{{$}} --- name: reg_pressure diff --git a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll index 239fa80ade98a9..04f2e3235d44a7 100644 --- a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll +++ b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll @@ -12,5 +12,5 @@ define amdgpu_kernel void @impossible_occupancy() #1 { ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="9" } +attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-waves-per-eu"="9" } attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="11" } diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index bb7a591c914654..01eb1b1a353d12 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2994,71 
+2994,70 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v11, 0 +; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_lo_u32 v10, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0 -; VI-NEXT: v_mul_lo_u32 v14, v5, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_mov_b32_e32 v10, v3 -; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v14 -; VI-NEXT: v_mov_b32_e32 v10, v4 -; VI-NEXT: v_mov_b32_e32 v4, v11 -; VI-NEXT: v_mul_lo_u32 v7, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] -; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] -; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13 -; VI-NEXT: v_mov_b32_e32 v0, v4 -; VI-NEXT: v_mul_lo_u32 v11, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v0 -; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] -; VI-NEXT: v_add_u32_e32 v5, vcc, v11, v13 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc -; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; VI-NEXT: v_mul_lo_u32 v3, v4, v3 +; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mul_lo_u32 v2, 
v5, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 +; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3] +; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15] +; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; VI-NEXT: v_addc_u32_e64 v4, s[0:1], 0, 0, vcc +; VI-NEXT: v_mul_lo_u32 v0, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4] +; VI-NEXT: v_mul_lo_u32 v1, v6, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v10 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, v3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, v4, v0, vcc +; VI-NEXT: v_mov_b32_e32 v9, v2 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12] -; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 -; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX9-NEXT: 
v_add_co_u32_e32 v6, vcc, v7, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] -; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v13, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_add3_u32 v9, v9, v13, v10 +; GFX9-NEXT: v_mul_lo_u32 v13, v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] +; GFX9-NEXT: v_mul_lo_u32 v0, v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11] +; GFX9-NEXT: v_add3_u32 v0, v0, v9, v13 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc +; GFX9-NEXT: global_store_dwordx4 v12, v[2:5], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 1e9994dd8e6efd..299bbdac600917 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_add_u32_e32 v18, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; 
GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v20, v3, v16 -; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13 +; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 +; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 37bf8516403bf5..312dfa3717c777 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1616,24 +1616,24 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 ; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| ; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 ; 
GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v14, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 -; GFX9-NEXT: v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off @@ -1952,71 +1952,71 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2070306 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 +; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 +; GFX9-NEXT: v_trunc_f32_e32 v18, v18 +; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3 +; GFX9-NEXT: v_mul_f32_e32 v14, v16, v19 +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16 +; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13 +; GFX9-NEXT: v_trunc_f32_e32 v13, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10| +; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v16 -; GFX9-NEXT: v_mul_f32_e32 v20, v10, v20 -; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX9-NEXT: v_mul_f32_e32 v21, v13, v21 -; GFX9-NEXT: v_trunc_f32_e32 v20, v20 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16 ; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_mul_f32_e32 v22, v16, v22 -; GFX9-NEXT: v_mul_f32_e32 v23, v19, v23 -; GFX9-NEXT: v_trunc_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v24, -v20, v3, v10 
+; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_trunc_f32_e32 v22, v22 -; GFX9-NEXT: v_trunc_f32_e32 v23, v23 -; GFX9-NEXT: v_mad_f32 v13, -v21, v14, v13 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v3| -; GFX9-NEXT: v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v13, v13 +; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 +; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 ; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v25, -v22, v10, v16 -; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22 -; GFX9-NEXT: v_mad_f32 v19, -v23, v16, v19 -; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| -; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 ; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v10| -; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v15, vcc +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| -; GFX9-NEXT: 
v_cndmask_b32_e32 v12, 0, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v20, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v21, v3 -; GFX9-NEXT: v_add_u32_e32 v10, v22, v10 -; GFX9-NEXT: v_add_u32_e32 v12, v23, v12 -; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v13, v2 +; GFX9-NEXT: v_add_u32_e32 v12, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v13, v14, v15 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v10 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v4, v10, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v17 +; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11 +; GFX9-NEXT: v_mul_lo_u32 v10, v13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v3, v17, v4 -; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10 +; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: 
global_store_dword v[7:8], v1, off @@ -2503,39 +2503,39 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 ; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX9-NEXT: v_mad_f32 v20, -v16, v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 +; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 ; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 -; GFX9-NEXT: v_mad_f32 v21, -v17, v11, v3 +; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v15, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 ; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v16, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v21|, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v17, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v10 -; GFX9-NEXT: v_mul_lo_u32 v0, v11, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v13, v12 -; GFX9-NEXT: v_sub_u32_e32 v2, v10, v2 -; GFX9-NEXT: v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10 +; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12 +; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4 +; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0 -; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: global_store_dword v[7:8], v1, off diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index f496a4b06bb237..81925de8910f80 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -5,8 +5,8 @@ # is killed by that store. 
# GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN: renamable $vgpr34_vgpr35_vgpr36_vgpr37 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr26_vgpr27_vgpr28_vgpr29, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 # GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 # GCN-GCNTRACKER-NOT: SI_SPILL diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a2a0107a6f7d81..a1197aeace86f0 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -361,96 +361,96 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, 
vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v4 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v4 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v4 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v4 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v4 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v4 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22] -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v4 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v4 -; GFX8-NEXT: v_addc_u32_e32 v26, 
vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] -; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20] ; GFX8-NEXT: s_addk_i32 s1, 0x2000 ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2 +; GFX8-NEXT: 
flat_load_dwordx2 v[19:20], v[21:22] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v16, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v18, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v19, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v20, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v21, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v22, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v23, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v24, v3, 
vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v25, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v26, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v27, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v28, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -462,9 +462,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -496,91 +496,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: s_movk_i32 s2, 0x7f -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_movk_i32 s0, 0xd000 -; GFX900-NEXT: s_movk_i32 s1, 0xe000 -; GFX900-NEXT: s_movk_i32 s3, 0xf000 +; GFX900-NEXT: s_movk_i32 s4, 0x7f +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: s_movk_i32 s2, 0xd000 +; GFX900-NEXT: s_movk_i32 s3, 0xe000 +; GFX900-NEXT: s_movk_i32 s5, 0xf000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; 
Child Loop BB1_2 Depth 2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: s_mov_b32 s6, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[9:10], v[4:5], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[11:12], v[4:5], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v4 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v5, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v4 -; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off -; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off -; 
GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[29:30], v[4:5], off -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX900-NEXT: s_addk_i32 s4, 0x2000 -; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(8) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(7) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v17, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v18, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc +; GFX900-NEXT: s_addk_i32 s6, 0x2000 +; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 +; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s5, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v14, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, 
vcc, v15, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v23, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v24, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v25, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v26, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v27, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v28, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v19, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v20, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v10, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v10, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v29, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v30, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_add_i32 s4, s2, -1 
-; GFX900-NEXT: s_cmp_eq_u32 s2, 0 +; GFX900-NEXT: s_add_i32 s0, s4, -1 +; GFX900-NEXT: s_cmp_eq_u32 s4, 0 ; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_mov_b32 s2, s4 +; GFX900-NEXT: s_mov_b32 s4, s0 ; GFX900-NEXT: s_branch .LBB1_1 ; GFX900-NEXT: .LBB1_5: ; %while.end ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX900-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read: diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6583d5e8aa5a07..704947523f677c 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -70,22 +70,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 +; GFX9-NEXT: v_or_b32_e32 v12, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 -; GFX9-NEXT: v_or_b32_e32 v12, v10, v8 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v8 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 
v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] @@ -107,47 +107,47 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_sub_u32_e32 v12, 64, v24 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v12, v8, v12 ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v13, v9, v13 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; 
GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -155,20 +155,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] ; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 -; GFX9-NEXT: v_or3_b32 v6, v6, v8, v10 +; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 ; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc -; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_or_b32_e32 v12, v18, v12 +; GFX9-NEXT: v_or_b32_e32 v10, v18, v10 ; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 -; 
GFX9-NEXT: v_or_b32_e32 v13, v19, v13 +; GFX9-NEXT: v_or_b32_e32 v11, v19, v11 ; GFX9-NEXT: v_and_b32_e32 v19, v8, v22 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 ; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 @@ -185,7 +185,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v9 -; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 +; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v18, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -194,12 +194,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 -; GFX9-NEXT: v_or3_b32 v11, v7, 0, v11 -; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 -; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11 +; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v15 ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] @@ -209,19 +209,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23 +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v10, v23 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 ; GFX9-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v10, v[13:14] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v11, v[13:14] ; GFX9-NEXT: v_add3_u32 v8, v8, v16, 
v9 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mul_lo_u32 v12, v12, v22 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v10, v[8:9] -; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v11, v[8:9] +; GFX9-NEXT: v_add3_u32 v4, v10, v7, v12 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v7, v13 @@ -1628,38 +1628,38 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] -; GFX9-NEXT: v_or_b32_e32 v10, v20, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v20, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v11, v21, v31 +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v21 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v16, v16, v20 ; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v20, v30, v6 -; GFX9-NEXT: 
v_subb_co_u32_e32 v18, vcc, v18, v20, vcc +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, v30, v6 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v12, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll index a4335095115842..dc5e442c2b2622 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -8,7 +8,7 @@ ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 -define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) { +define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb3 @@ -43,3 +43,5 @@ bb3: ; preds = %bb3, %bb declare double @llvm.fma.f64(double, double, double) declare i32 @llvm.amdgcn.workitem.id.x() + +attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll index 8bbae59f468f1d..cbd1714a5e375e 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -127,7 
+127,7 @@ define void @test_func() !dbg !6 { ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False -; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8 +; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 10 ; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0 @@ -146,7 +146,7 @@ define void @empty_func() !dbg !8 { ; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr ; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0)) +; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0 @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 { ; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr ; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, 
test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0)) +; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 8f4a4b5afcdc1e..554e3640221b94 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1675,7 +1675,7 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1716,23 +1716,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc, v13, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1978,7 +1977,7 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2019,23 +2018,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], 
v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 -; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2245,8 +2243,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: 
v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -2254,60 +2252,60 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: 
v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], 
v[12:13] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2520,8 +2518,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -2529,61 +2527,61 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 
vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], 
v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v8 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; 
SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v6 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 6d79837feb1289..6796391aba6751 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -42,7 +42,7 @@ body: | ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec @@ -50,8 +50,8 @@ body: | ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 
851978 /* regdef:VGPR_16 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 268322bd074bfd..648f4fc64f9d03 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -44,9 +44,9 @@ entry: ; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: ; TONGA: NumSgprs: 96 ; TONGA-GCNTRACKERS: NumSgprs: 96 -; TONGA: NumVgprs: 33 -; TONGA-GCNTRACKERS: NumVgprs: 25 -; TONGA: Occupancy: 7 +; TONGA: NumVgprs: 21 +; TONGA-GCNTRACKERS: NumVgprs: 23 +; TONGA: Occupancy: 8 ; TONGA-GCNTRACKERS: Occupancy: 8 @@ -59,11 +59,11 @@ define amdgpu_kernel void 
@global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: ; GENERIC: NumSgprs: 71 -; GENERIC-GCNTRACKERS: NumSgprs: 54 -; GENERIC: NumVgprs: 16 -; GENERIC-GCNTRACKERS: NumVgprs: 16 +; GENERIC-GCNTRACKERS: NumSgprs: 45 +; GENERIC: NumVgprs: 20 +; GENERIC-GCNTRACKERS: NumVgprs: 20 ; GENERIC: Occupancy: 7 -; GENERIC-GCNTRACKERS: Occupancy: 8 +; GENERIC-GCNTRACKERS: Occupancy: 10 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { %load = load <64 x i16>, ptr addrspace(4) %in diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 9429d1565962e4..e67036f0bbbea2 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -16,20 +16,20 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 - ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 - ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 - ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2 ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub2, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec ; CHECK-NEXT: S_BARRIER - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY6]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY 
$vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll index bd1258cb1cf980..1e5d6755fbc85f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll @@ -42,4 +42,4 @@ bb2: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-num-vgpr"="9" } +attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 71f8d91874f04f..5a30d5d5e42ec2 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -7,16 +7,16 @@ ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target ; CHECK-LABEL: {{^}}load_fma_store: -; OCC: NumVgprs: 32 -; OCC-GCNTRACKER: NumVgprs: 24 +; OCC: NumVgprs: 24 +; OCC-GCNTRACKER: NumVgprs: 26 ; RELAX: NumVgprs: 64 ; RELAX-GCNTRACKER: NumVgprs: 60 -; OCC: NumVGPRsForWavesPerEU: 32 -; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24 +; OCC: NumVGPRsForWavesPerEU: 24 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 26 ; RELAX: NumVGPRsForWavesPerEU: 64 ; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 -; OCC: Occupancy: 8 -; OCC-GCNTRACKER: Occupancy: 8 +; OCC: Occupancy: 10 +; OCC-GCNTRACKER: Occupancy: 9 ; RELAX: Occupancy: 4 ; RELAX-GCNTRACKER: Occupancy: 4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 6225ff73e28d08..57c54c4de71027 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -792,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: s_mov_b32 s6, s10 -; GCN-NEXT: s_mov_b32 s7, s11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: s_mov_b32 s8, s0 -; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 +; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 +; GCN-NEXT: v_max_i32_e32 v5, v5, v12 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v5 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 ; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 -; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_max_i32_e32 v4, v4, v10 -; GCN-NEXT: v_max_i32_e32 v5, v5, v13 -; GCN-NEXT: v_max_i32_e32 v6, v6, v16 -; GCN-NEXT: v_max_i32_e32 v1, v1, v12 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GCN-NEXT: v_cvt_f32_u32_e32 v16, v6 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_max_i32_e32 v1, v1, v13 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 
v16, v16, v10 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_max_i32_e32 v6, v6, v15 +; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v7 +; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v12 +; GCN-NEXT: v_mul_hi_u32 v10, v1, v10 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_mul_hi_u32 v13, v12, v16 ; GCN-NEXT: v_max_i32_e32 v0, v0, v9 -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v11 -; GCN-NEXT: v_max_i32_e32 v2, v2, v15 -; GCN-NEXT: v_max_i32_e32 v11, v7, v17 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 -; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 -; GCN-NEXT: v_cvt_f32_u32_e32 v18, v11 -; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 -; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 -; GCN-NEXT: v_rcp_iflag_f32_e32 v18, v18 +; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, v16, v17 +; GCN-NEXT: v_mul_lo_u32 v13, v10, v5 ; GCN-NEXT: v_mul_hi_u32 v12, v0, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v1, v13 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v14 -; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 -; GCN-NEXT: v_mul_lo_u32 v15, v12, v4 -; GCN-NEXT: v_mul_lo_u32 v17, 
v13, v5 -; GCN-NEXT: v_mul_lo_u32 v21, v14, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v11 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v16, vcc, v1, v5 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v19, v19, v18 -; GCN-NEXT: v_sub_i32_e32 v20, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] -; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] -; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v14 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v13, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; GCN-NEXT: v_mul_lo_u32 v1, v12, v4 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v5, v5, v9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; GCN-NEXT: 
v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 +; GCN-NEXT: v_mul_hi_u32 v4, v9, v5 +; GCN-NEXT: v_max_i32_e32 v5, v7, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; GCN-NEXT: v_max_i32_e32 v2, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_mul_hi_u32 v4, v18, v19 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GCN-NEXT: v_max_i32_e32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v5, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v10 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_mul_lo_u32 v6, v4, v11 +; 
GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GCN-NEXT: v_max_i32_e32 v6, v3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v5, v11 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s11, 0xf000 -; TONGA-NEXT: s_mov_b32 s10, -1 -; TONGA-NEXT: s_mov_b32 s6, s10 -; TONGA-NEXT: s_mov_b32 s7, s11 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s4, s2 -; TONGA-NEXT: s_mov_b32 s5, s3 -; TONGA-NEXT: buffer_load_dwordx4 
v[0:3], off, s[4:7], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s8, s0 -; TONGA-NEXT: s_mov_b32 s9, s1 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 +; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 +; TONGA-NEXT: v_max_i32_e32 v5, v5, v12 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v5 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v6 ; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 -; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 -; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 -; TONGA-NEXT: v_max_i32_e32 v6, v6, v16 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 -; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 -; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v5 -; TONGA-NEXT: v_cvt_f32_u32_e32 v16, v6 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v13 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 +; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 +; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 ; TONGA-NEXT: 
v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; TONGA-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; TONGA-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v14 -; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v16 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v7 +; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v12 +; TONGA-NEXT: v_mul_hi_u32 v10, v1, v10 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_mul_hi_u32 v13, v12, v16 ; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 -; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v11 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v15 -; TONGA-NEXT: v_max_i32_e32 v11, v7, v17 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v4 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v6 -; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12 -; TONGA-NEXT: v_mul_lo_u32 v15, v15, v14 -; TONGA-NEXT: v_mul_lo_u32 v17, v17, v16 -; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v11 -; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 -; TONGA-NEXT: v_mul_hi_u32 v15, v14, v15 -; TONGA-NEXT: v_mul_hi_u32 v17, v16, v17 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v15 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v17 +; TONGA-NEXT: v_mul_lo_u32 v13, v10, v5 ; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12 -; TONGA-NEXT: v_mul_hi_u32 v13, v1, v13 -; TONGA-NEXT: v_mul_hi_u32 v14, v2, v14 -; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 -; TONGA-NEXT: v_mul_lo_u32 v15, v12, v4 -; TONGA-NEXT: v_mul_lo_u32 v17, v13, v5 -; TONGA-NEXT: v_mul_lo_u32 v21, v14, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 -; 
TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v13 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v11 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, v1, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] -; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] -; TONGA-NEXT: v_mul_lo_u32 v19, v19, v18 -; TONGA-NEXT: v_sub_u32_e32 v20, vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v12 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v14 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; TONGA-NEXT: v_mul_lo_u32 v1, v12, v4 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v5, v5, v9 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; TONGA-NEXT: 
v_cmp_ge_u32_e64 s[2:3], v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 +; TONGA-NEXT: v_mul_hi_u32 v4, v9, v5 +; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 +; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 -; TONGA-NEXT: v_mul_hi_u32 v4, v18, v19 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v3 -; TONGA-NEXT: v_max_i32_e32 v5, v3, v5 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v5, v4 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v10 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 -; TONGA-NEXT: v_mul_lo_u32 v6, v4, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; TONGA-NEXT: 
v_mul_hi_u32 v4, v9, v10 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 +; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 +; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v6 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v5, v11 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 04a824a073a7eb..459ef648fd806c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -873,20 +873,20 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 
-; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0 -; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v10, v11, v10 ; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7 ; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6 -; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v4, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; NOSDWA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 572026da79646c..26a4a6743cffae 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1508,52 +1508,52 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 ; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 ; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v8f16: @@ -1652,81 +1652,81 @@ define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) { ; 
SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v11, v15, v11 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v9, v24, v25 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v21 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc ; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc 
-; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 @@ -1772,136 +1772,132 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 ; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: 
s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; SI-NEXT: v_cndmask_b32_e32 v0, v37, v0, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[12:13] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[14:15] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; 
SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 -; SI-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[16:17] -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cndmask_b32_e32 v4, v18, v4, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 +; SI-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v6, v21, v6, vcc +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_cndmask_b32_e32 v7, v22, v7, vcc +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cndmask_b32_e32 v8, v23, v8, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v23, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v9, v24, v9, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v28 +; SI-NEXT: v_cndmask_b32_e32 v10, v23, v10, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 +; SI-NEXT: v_cndmask_b32_e32 v11, v24, v11, vcc +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 -; SI-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v14, v20, v14, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cndmask_b32_e32 v13, v20, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cndmask_b32_e32 v15, v18, v15, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v16f16: @@ -1912,25 +1908,22 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v31, s30, 0 ; VI-NEXT: v_writelane_b32 v31, s31, 1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 ; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24 +; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 ; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 ; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31] ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 +; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 ; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29] ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11] -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e64 
s[26:27], 0, v25 -; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20 +; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20 ; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23 ; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 @@ -1939,46 +1932,49 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25] ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22 +; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22 ; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23] ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21] -; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9] +; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24 +; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11] +; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] +; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 +; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] +; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] +; VI-NEXT: v_readlane_b32 s31, v31, 1 +; VI-NEXT: v_readlane_b32 s30, v31, 0 +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26 -; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] -; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9] ; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28 -; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30 -; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15] -; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17] ; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s31, v31, 1 -; VI-NEXT: v_readlane_b32 s30, v31, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; VI-NEXT: v_cndmask_b32_e32 v8, v13, v11, vcc ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index fc6ad39db5b89f..a423b6f831a9d8 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -286,18 +286,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 @@ -335,18 +335,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: 
v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 @@ -384,18 +384,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 diff --git 
a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 6b4bca11d80c78..7e7f4f5d19914b 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -911,20 +911,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 +; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll index 48ba2235ae2c3e..b0258985bfa901 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -321,15 +321,15 @@ define void 
@v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index b6d305a2b0ab1f..b06739392e5075 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -295,10 +295,10 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -591,10 +591,10 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -637,7 +637,6 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v16, v14 ; GFX900-NEXT: v_mov_b32_e32 v17, v15 ; GFX900-NEXT: ;;#ASMSTART @@ -645,7 +644,8 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v18, v0 ; GFX900-NEXT: v_mov_b32_e32 v19, v1 -; GFX900-NEXT: global_store_dwordx4 v20, v[16:19], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1091,12 +1091,12 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1465,10 +1465,10 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1825,12 +1825,12 @@ define void 
@v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1876,10 +1876,10 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7917,10 +7917,10 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll index 07d1437e2694ef..0b0570a328201a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -321,15 +321,15 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll index f6253c4c027517..976c7b4fa704e9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2558,15 +2558,15 @@ define void @v_shuffle_v3i64_v4i64__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3418,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4283,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5029,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5879,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -6735,19 +6735,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7542,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 
v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll index ca6e625620f0b6..6c086a40c41538 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] 
offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2558,15 +2558,15 @@ define void @v_shuffle_v3p0_v4p0__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3418,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: 
global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4283,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5029,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 
-; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5879,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6735,19 +6735,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7542,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index e21d98b5aaf4f9..ab0dbd2f3ba421 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; 
GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,17 +2262,18 @@ define void @v_shuffle_v4i64_v4i64__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2735,16 +2736,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2930,18 +2931,18 @@ define void 
@v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3133,18 +3134,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,19 +3765,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5246,18 +5246,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: 
global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6718,18 +6718,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6852,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: 
; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8043,19 +8043,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8714,18 +8713,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND 
-; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9477,18 +9476,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10908,18 +10907,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr 
addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12303,18 +12302,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] 
+; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index e4a0c0057a0623..8ce765abf5e829 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, 
v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: 
global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,17 +2262,18 @@ define void @v_shuffle_v4p0_v4p0__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2735,16 +2736,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: 
global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2930,18 +2931,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3133,18 +3134,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, 
v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,19 +3765,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5246,18 +5246,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: 
;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6718,18 +6718,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6852,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def 
v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8043,19 +8043,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; 
GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8714,18 +8713,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9477,18 +9476,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10908,18 +10907,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12303,18 +12302,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: 
;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 9d550ec27a63bf..8150328dd24f03 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -605,20 +605,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 -; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 +; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: @@ -631,20 +631,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] +; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] -; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] -; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] +; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index ce15bbcc9e189b..6423267be4b34f 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ 
b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -6117,108 +6117,108 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v8 ; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v22, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v22 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v11 -; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v22 -; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v11, vcc -; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; TONGA-NEXT: v_rcp_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; TONGA-NEXT: v_trunc_f32_e32 v9, v9 -; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v20, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v21, v8 -; TONGA-NEXT: v_mul_lo_u32 v18, v23, v20 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v21, 0 -; TONGA-NEXT: v_mul_lo_u32 v19, v24, v21 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v18 -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v9, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v21, v25, 0 -; TONGA-NEXT: v_mul_hi_u32 v9, v21, v8 -; TONGA-NEXT: v_add_u32_e32 v26, vcc, v9, v18 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v20, v8, 0 -; TONGA-NEXT: v_addc_u32_e32 v27, vcc, 0, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v25, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v26, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v27, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v18 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v21, v8 -; TONGA-NEXT: v_addc_u32_e32 v26, vcc, v20, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v25, 0 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v26 -; TONGA-NEXT: v_mul_lo_u32 v21, v24, v25 -; TONGA-NEXT: v_mul_hi_u32 v23, v25, v8 -; TONGA-NEXT: 
v_mad_u64_u32 v[18:19], s[0:1], v26, v8, 0 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v20, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v21 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v26, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v23, v20 -; TONGA-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 +; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 +; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 +; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc +; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 +; TONGA-NEXT: v_rcp_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 +; TONGA-NEXT: v_trunc_f32_e32 v18, v18 +; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 +; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 +; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 +; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 +; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 +; 
TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 +; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23 +; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0 +; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 +; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v21, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v18, v8 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v8 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v26, v9, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v20, 31, v15 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v14, v20 -; TONGA-NEXT: v_xor_b32_e32 v21, v8, v20 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v21, v19, 0 -; TONGA-NEXT: v_mul_hi_u32 v23, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v20, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v15, v20 -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v23, v8 -; TONGA-NEXT: v_addc_u32_e32 v24, vcc, 0, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v18, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v23, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v24, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v8, v18 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; TONGA-NEXT: v_mul_lo_u32 v19, v22, v8 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_mul_lo_u32 v18, v11, v18 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v19, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v18, v9 -; TONGA-NEXT: 
v_sub_u32_e32 v18, vcc, v15, v9 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v21, v8 -; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, vcc -; TONGA-NEXT: v_sub_u32_e64 v19, s[0:1], v8, v22 -; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v18, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v11 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 +; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 +; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc +; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 +; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc +; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v22 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, 
v11, vcc ; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v11 -; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, s[0:1] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v19, v22 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v15, v9, vcc -; TONGA-NEXT: v_subbrev_u32_e64 v18, s[0:1], 0, v18, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v9, v11 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 ; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v22 -; TONGA-NEXT: v_cndmask_b32_e64 v18, v21, v18, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v9, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v11, v15, v21, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v19, v19, v24, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v19, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc -; TONGA-NEXT: v_xor_b32_e32 v8, v8, v20 -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v20 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v8, v20 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v9, v20, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 +; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 +; 
TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 +; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc ; TONGA-NEXT: s_cbranch_execnz .LBB12_3 ; TONGA-NEXT: .LBB12_2: ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 @@ -8991,33 +8991,33 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 ; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 +; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v2, v13 -; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc ; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v4, v14 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 +; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 ; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v5, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v6, v15 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc +; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc ; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 +; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 ; TONGA-NEXT: v_and_b32_e32 
v13, -4, v13 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v13 -; TONGA-NEXT: v_and_b32_e32 v14, -4, v14 -; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v17, vcc -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 -; TONGA-NEXT: v_and_b32_e32 v15, -4, v15 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v18, vcc -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v15 -; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v19, vcc +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc ; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 29488579c15537..a9b1f7e888567f 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 -; SI-NEXT: 
v_lshr_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 6ed19bd6d764b8..30a0a26ca173e0 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -776,14 +776,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 @@ -895,14 +895,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 ; GFX8-NEXT: 
v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 7c310477dd838f..530226baa775e1 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -862,43 +862,43 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 -; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 +; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 ; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 ; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 ; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 -; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 -; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 +; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] ; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 
v16, vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3 +; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1] +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5] +; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index d1d8240a1007a2..883657547519ba 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -39,7 +39,7 @@ ; 
CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: BitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index ad6e92a25b8615..278bf086d6088b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: BitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 3eff89239d5418..89d831b51f6947 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -153,7 +153,7 @@ body: | # FULL-NEXT: fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -175,7 +175,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: no_mfi @@ -229,7 +229,7 @@ body: | # FULL-NEXT: 
fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -251,7 +251,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: empty_mfi @@ -306,7 +306,7 @@ body: | # FULL-NEXT: fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -329,7 +329,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: empty_mfi_entry_func @@ -457,11 +457,11 @@ body: | ... 
--- -# ALL-LABEL: name: occupancy_0 -# ALL: occupancy: 8 -name: occupancy_0 +# ALL-LABEL: name: occupancy_10 +# ALL: occupancy: 10 +name: occupancy_10 machineFunctionInfo: - occupancy: 0 + occupancy: 10 body: | bb.0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index eca3f99b64955b..ec56de11b250a4 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -167,7 +167,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' @@ -220,7 +220,7 @@ define void @function() { ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' From c3b40c7ea215487ffc3b9d146f3f8f9a7ac8d407 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 Jan 2025 16:14:23 +0100 Subject: [PATCH 151/208] [X86] Regenerate test checks (NFC) Regenerate some tests for the new vpternlog printing. 
--- llvm/test/CodeGen/X86/uadd_sat_vec.ll | 24 ++++++++++++------------ llvm/test/CodeGen/X86/usub_sat_vec.ll | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index 50c73009314a9d..1ff95c876a6b17 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -587,7 +587,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -596,7 +596,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512BW-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -655,7 +655,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -664,7 +664,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512BW-LABEL: v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512BW-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -747,7 
+747,7 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512F-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -755,7 +755,7 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512BW-LABEL: v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512BW-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 ; AVX512BW-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -885,7 +885,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX512-LABEL: v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512-NEXT: vpminud %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq @@ -938,7 +938,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -947,7 +947,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512BW-LABEL: v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512BW-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ 
-1022,7 +1022,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1030,7 +1030,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512BW-LABEL: v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512BW-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 ; AVX512BW-NEXT: vpminuq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1149,7 +1149,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX512-LABEL: v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 73e90fe77bca28..34eb30dfebeeb2 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -543,7 +543,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; ; AVX512BW-LABEL: v16i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogd $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem) ; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z From f61d93ffc456d94df729529642ea180b40ef9d19 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Thu, 23 Jan 2025 10:23:50 -0500 Subject: [PATCH 152/208] [Flang] Generate math.acos op 
for non-precise acos intrinsic calls (#123641) This patch changes the codgegn for non-precise acos calls to generate math.acos ops. This wasn't done before because the math dialect did not have a acos operation at the time. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 6 ++- .../test/Lower/HLFIR/elemental-intrinsics.f90 | 2 +- flang/test/Lower/Intrinsics/acos.f90 | 36 ++++++++++------- flang/test/Lower/dummy-procedure.f90 | 2 +- flang/test/Lower/trigonometric-intrinsics.f90 | 40 +++++++++++++++++++ 5 files changed, 67 insertions(+), 19 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 63c013dda95e64..db9918c265164d 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1027,8 +1027,10 @@ static constexpr MathOperation mathOperations[] = { {"abs", "cabs", genFuncType, Ty::Complex<8>>, genComplexMathOp}, {"abs", RTNAME_STRING(CAbsF128), FuncTypeReal16Complex16, genLibF128Call}, - {"acos", "acosf", genFuncType, Ty::Real<4>>, genLibCall}, - {"acos", "acos", genFuncType, Ty::Real<8>>, genLibCall}, + {"acos", "acosf", genFuncType, Ty::Real<4>>, + genMathOp}, + {"acos", "acos", genFuncType, Ty::Real<8>>, + genMathOp}, {"acos", RTNAME_STRING(AcosF128), FuncTypeReal16Real16, genLibF128Call}, {"acos", "cacosf", genFuncType, Ty::Complex<4>>, genLibCall}, {"acos", "cacos", genFuncType, Ty::Complex<8>>, genLibCall}, diff --git a/flang/test/Lower/HLFIR/elemental-intrinsics.f90 b/flang/test/Lower/HLFIR/elemental-intrinsics.f90 index dd79688663cba4..689f0a08ca7ab5 100644 --- a/flang/test/Lower/HLFIR/elemental-intrinsics.f90 +++ b/flang/test/Lower/HLFIR/elemental-intrinsics.f90 @@ -15,7 +15,7 @@ subroutine simple_elemental(x,y) ! CHECK: ^bb0(%[[VAL_9:.*]]: index): ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_9]]) : (!fir.ref>, index) -> !fir.ref ! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref -! 
CHECK: %[[VAL_12:.*]] = fir.call @acosf(%[[VAL_11]]) fastmath : (f32) -> f32 +! CHECK: %[[VAL_12:.*]] = math.acos %[[VAL_11]] fastmath : f32 ! CHECK: hlfir.yield_element %[[VAL_12]] : f32 ! CHECK: } ! CHECK: hlfir.assign diff --git a/flang/test/Lower/Intrinsics/acos.f90 b/flang/test/Lower/Intrinsics/acos.f90 index d2ef8e1cef0a88..849d36ad1323b8 100644 --- a/flang/test/Lower/Intrinsics/acos.f90 +++ b/flang/test/Lower/Intrinsics/acos.f90 @@ -1,9 +1,9 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s function test_real4(x) real :: x, test_real4 @@ -11,15 +11,9 @@ function test_real4(x) end function ! ALL-LABEL: @_QPtest_real4 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @acosf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 - -function test_real8(x) - real(8) :: x, test_real8 - test_real8 = acos(x) -end function - -! ALL-LABEL: @_QPtest_real8 -! 
ALL: {{%[A-Za-z0-9._]+}} = fir.call @acos({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.acos {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.acos {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @acosf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 function test_complex4(x) complex :: x, test_complex4 @@ -37,3 +31,15 @@ function test_complex8(x) ! ALL-LABEL: @_QPtest_complex8 ! ALL: {{%[A-Za-z0-9._]+}} = fir.call @cacos({{%[A-Za-z0-9._]+}}) {{.*}}: (complex) -> complex +function test_real8(x) + real(8) :: x, test_real8 + test_real8 = acos(x) +end function + +! ALL-LABEL: @_QPtest_real8 +! FAST: {{%[A-Za-z0-9._]+}} = math.acos {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.acos {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @acos({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 + +! PRECISE-DAG: func.func private @acosf(f32) -> f32 attributes {fir.bindc_name = "acosf", fir.runtime} +! PRECISE-DAG: func.func private @acos(f64) -> f64 attributes {fir.bindc_name = "acos", fir.runtime} diff --git a/flang/test/Lower/dummy-procedure.f90 b/flang/test/Lower/dummy-procedure.f90 index 6874e8eca90b94..a84c351b1166b2 100644 --- a/flang/test/Lower/dummy-procedure.f90 +++ b/flang/test/Lower/dummy-procedure.f90 @@ -154,7 +154,7 @@ subroutine todo3(dummy_proc) ! CHECK-LABEL: func private @fir.acos.f32.ref_f32(%arg0: !fir.ref) -> f32 !CHECK: %[[load:.*]] = fir.load %arg0 - !CHECK: %[[res:.*]] = fir.call @acosf(%[[load]]) fastmath : (f32) -> f32 + !CHECK: %[[res:.*]] = math.acos %[[load]] fastmath : f32 !CHECK: return %[[res]] : f32 ! 
CHECK-LABEL: func private @fir.atan2.f32.ref_f32.ref_f32( diff --git a/flang/test/Lower/trigonometric-intrinsics.f90 b/flang/test/Lower/trigonometric-intrinsics.f90 index 731ec6bbf68557..d1edd4ef48dc30 100644 --- a/flang/test/Lower/trigonometric-intrinsics.f90 +++ b/flang/test/Lower/trigonometric-intrinsics.f90 @@ -87,6 +87,34 @@ subroutine cos_testcd(z) z = cos(z) end subroutine +! CHECK-LABEL: acos_testr +subroutine acos_testr(a, b) + real :: a, b +! CHECK: fir.call @fir.acos.contract.f32.f32 + b = acos(a) +end subroutine + +! CHECK-LABEL: acos_testd +subroutine acos_testd(a, b) + real(kind=8) :: a, b +! CHECK: fir.call @fir.acos.contract.f64.f64 + b = acos(a) +end subroutine + +! CHECK-LABEL: acos_testc +subroutine acos_testc(z) + complex :: z +! CHECK: fir.call @fir.acos.contract.z32.z32 + z = acos(z) +end subroutine + +! CHECK-LABEL: acos_testcd +subroutine acos_testcd(z) + complex(kind=8) :: z +! CHECK: fir.call @fir.acos.contract.z64.z64 + z = acos(z) +end subroutine + ! CHECK-LABEL: cosh_testr subroutine cosh_testr(a, b) real :: a, b @@ -211,6 +239,18 @@ subroutine sinh_testcd(z) ! CMPLX-FAST: complex.cos %{{.*}} : complex ! CMPLX-PRECISE: fir.call @ccos +! CHECK-LABEL: @fir.acos.contract.f32.f32 +! CHECK: math.acos {{.*}} : f32 + +! CHECK-LABEL: @fir.acos.contract.f64.f64 +! CHECK: math.acos {{.*}} : f64 + +! CHECK-LABEL: @fir.acos.contract.z32.z32 +! CHECK: fir.call @cacosf + +! CHECK-LABEL: @fir.acos.contract.z64.z64 +! CHECK: fir.call @cacos + ! CHECK-LABEL: @fir.cosh.contract.f32.f32 ! CHECK: math.cosh {{.*}} : f32 From 6fe0fc60341b05bf30ccc16012dab9eeb55a338d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 Jan 2025 16:33:49 +0100 Subject: [PATCH 153/208] [CallingConv] Return ArrayRef from AllocateRegBlock() (NFC) (#124120) Instead of returning the first register, return the ArrayRef containing the whole block. 
Existing users rely on the fact that the register block only contains adjacently-numbered registers and it's possible to get the remaining registers in the block by just incrementing the register. Returning an ArrayRef allows more generic usage with non-adjacent registers. --- llvm/include/llvm/CodeGen/CallingConvLower.h | 15 ++++++++------- .../AArch64/AArch64CallingConvention.cpp | 18 +++++++++--------- llvm/lib/Target/ARM/ARMCallingConv.cpp | 10 +++++----- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h index 85171138d1eb9d..7ad27cd01336a6 100644 --- a/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -357,12 +357,13 @@ class CCState { return Reg; } - /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive - /// registers. If this is not possible, return zero. Otherwise, return the first - /// register of the block that were allocated, marking the entire block as allocated. - MCPhysReg AllocateRegBlock(ArrayRef Regs, unsigned RegsRequired) { + /// Attempt to allocate a block of RegsRequired consecutive registers. + /// If this is not possible, return an empty range. Otherwise, return a + /// range of consecutive registers, marking the entire block as allocated. + ArrayRef AllocateRegBlock(ArrayRef Regs, + unsigned RegsRequired) { if (RegsRequired > Regs.size()) - return 0; + return {}; for (unsigned StartIdx = 0; StartIdx <= Regs.size() - RegsRequired; ++StartIdx) { @@ -379,11 +380,11 @@ class CCState { for (unsigned BlockIdx = 0; BlockIdx < RegsRequired; ++BlockIdx) { MarkAllocated(Regs[StartIdx + BlockIdx]); } - return Regs[StartIdx]; + return Regs.slice(StartIdx, RegsRequired); } } // No block was available - return 0; + return {}; } /// Version of AllocateReg with list of registers to be shadowed. 
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index fa04ccfba30f06..991d710c979b9e 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -176,27 +176,27 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, // [N x i32] arguments get packed into x-registers on Darwin's arm64_32 // because that's how the armv7k Clang front-end emits small structs. unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 2 : 1; - unsigned RegResult = State.AllocateRegBlock( + ArrayRef RegResult = State.AllocateRegBlock( RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg); - if (RegResult && EltsPerReg == 1) { - for (auto &It : PendingMembers) { - It.convertToReg(RegResult); + if (!RegResult.empty() && EltsPerReg == 1) { + for (const auto &[It, Reg] : zip(PendingMembers, RegResult)) { + It.convertToReg(Reg); State.addLoc(It); - ++RegResult; } PendingMembers.clear(); return true; - } else if (RegResult) { + } else if (!RegResult.empty()) { assert(EltsPerReg == 2 && "unexpected ABI"); bool UseHigh = false; CCValAssign::LocInfo Info; + unsigned RegIdx = 0; for (auto &It : PendingMembers) { Info = UseHigh ? 
CCValAssign::AExtUpper : CCValAssign::ZExt; - State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult, - MVT::i64, Info)); + State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, + RegResult[RegIdx], MVT::i64, Info)); UseHigh = !UseHigh; if (!UseHigh) - ++RegResult; + ++RegIdx; } PendingMembers.clear(); return true; diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp index 5a88fff41aeb1d..66a76a8c7a95a1 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -228,12 +228,12 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, break; } - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { - for (CCValAssign &PendingMember : PendingMembers) { - PendingMember.convertToReg(RegResult); + ArrayRef RegResult = + State.AllocateRegBlock(RegList, PendingMembers.size()); + if (!RegResult.empty()) { + for (const auto &[PendingMember, Reg] : zip(PendingMembers, RegResult)) { + PendingMember.convertToReg(Reg); State.addLoc(PendingMember); - ++RegResult; } PendingMembers.clear(); return true; From e1aa1e43decf9275175845bea970ef6d7c2b1af6 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Thu, 23 Jan 2025 07:52:30 -0800 Subject: [PATCH 154/208] [WPD]Provide branch weight for checking mode. (#124084) Checking mode aims to help diagnose and confirm undefined behavior. In most cases, source code don't cast pointers between unrelated types for virtual calls, so we expect direct calls in the frequent branch and debug trap in the unlikely branch. This way, the overhead of checking mode is not higher than an indirect call promotion for a hot callsite as long as the callsite doesn't run the debug trap branch. 
--- llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 5 +++-- llvm/test/ThinLTO/X86/devirt_check.ll | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index e889926930082f..30e935ea663f34 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1225,8 +1225,9 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, // perform a debug trap. if (DevirtCheckMode == WPDCheckMode::Trap) { auto *Cond = Builder.CreateICmpNE(CB.getCalledOperand(), Callee); - Instruction *ThenTerm = - SplitBlockAndInsertIfThen(Cond, &CB, /*Unreachable=*/false); + Instruction *ThenTerm = SplitBlockAndInsertIfThen( + Cond, &CB, /*Unreachable=*/false, + MDBuilder(M.getContext()).createUnlikelyBranchWeights()); Builder.SetInsertPoint(ThenTerm); Function *TrapFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap); diff --git a/llvm/test/ThinLTO/X86/devirt_check.ll b/llvm/test/ThinLTO/X86/devirt_check.ll index 74f1dfd6ac012a..4a9a7a4547fd6c 100644 --- a/llvm/test/ThinLTO/X86/devirt_check.ll +++ b/llvm/test/ThinLTO/X86/devirt_check.ll @@ -58,7 +58,7 @@ entry: ; Ensure !prof and !callees metadata for indirect call promotion removed. 
; TRAP-NOT: prof ; TRAP-NOT: callees - ; TRAP: br i1 %.not, label %1, label %0 + ; TRAP: br i1 %.not, label %1, label %0, !prof ![[PROF:[0-9]+]] ; TRAP: 0: ; TRAP: tail call void @llvm.debugtrap() ; TRAP: br label %1 @@ -89,6 +89,8 @@ entry: ; CHECK-LABEL: ret i32 ; CHECK-LABEL: } +; TRAP: ![[PROF]] = !{!"branch_weights", i32 1048575, i32 1} + declare i1 @llvm.type.test(i8*, metadata) declare void @llvm.assume(i1) From 96410edd4748a78e6b736eef8a5ff1ca4bb29be5 Mon Sep 17 00:00:00 2001 From: mingmingl Date: Thu, 23 Jan 2025 08:07:48 -0800 Subject: [PATCH 155/208] mark test as unsupported as I investigate test failure on certain environments --- llvm/test/CodeGen/X86/jump-table-partition.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index e4f8d90baafdb7..6a6aa00fa7fa0a 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -1,8 +1,8 @@ ; -stats requires asserts ; requires: asserts -; COM: Investigate test failure with fuchsia environment and re-enable the test. -; UNSUPPORTED: target={{.*}}-fuchsia +; COM: Fix test failures on certain environments and re-enable the test. +; UNSUPPORTED: target={{.*}} ; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of ; jump table entries so 'switch' needs fewer cases to generate a jump table. 
From cb426b18c2f683ed3b4be325f257d62976d22f00 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Thu, 23 Jan 2025 08:16:45 -0800 Subject: [PATCH 156/208] [NFC][DirectX] Clean-up of `DXIL.td` (#124005) - Runs clang-format on `DXIL.td` Note: this does not include the suggested formatting changes to `defset list OpClasses` as it does not enforce the formatting that is primarily used elsewhere - Reorders currently defined `DXIL` ops by opcode in ascending order to be consistent with other definitions This is a small cleanup moved to be separate from [#115912](https://github.com/llvm/llvm-project/issues/115912) for reviewability. --- llvm/lib/Target/DirectX/DXIL.td | 441 ++++++++++++++++---------------- 1 file changed, 220 insertions(+), 221 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index beb9b56dba30aa..d099bb395449da 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -235,7 +235,7 @@ defset list OpClasses = { def writeSamplerFeedback : DXILOpClass; def writeSamplerFeedbackBias : DXILOpClass; def writeSamplerFeedbackGrad : DXILOpClass; - def writeSamplerFeedbackLevel: DXILOpClass; + def writeSamplerFeedbackLevel : DXILOpClass; // This is a sentinel definition. Hence placed at the end here and // not as part of the above alphabetically sorted valid definitions. 
@@ -307,34 +307,35 @@ class Attributes attrs> { list fn_attrs = attrs; } -defvar BarrierMode_DeviceMemoryBarrier = 2; +defvar BarrierMode_DeviceMemoryBarrier = 2; defvar BarrierMode_DeviceMemoryBarrierWithGroupSync = 3; -defvar BarrierMode_GroupMemoryBarrier = 8; -defvar BarrierMode_GroupMemoryBarrierWithGroupSync = 9; -defvar BarrierMode_AllMemoryBarrier = 10; -defvar BarrierMode_AllMemoryBarrierWithGroupSync = 11; +defvar BarrierMode_GroupMemoryBarrier = 8; +defvar BarrierMode_GroupMemoryBarrierWithGroupSync = 9; +defvar BarrierMode_AllMemoryBarrier = 10; +defvar BarrierMode_AllMemoryBarrierWithGroupSync = 11; -defvar WaveOpKind_Sum = 0; +defvar WaveOpKind_Sum = 0; defvar WaveOpKind_Product = 1; -defvar WaveOpKind_Min = 2; -defvar WaveOpKind_Max = 3; +defvar WaveOpKind_Min = 2; +defvar WaveOpKind_Max = 3; -defvar SignedOpKind_Signed = 0; +defvar SignedOpKind_Signed = 0; defvar SignedOpKind_Unsigned = 1; // Intrinsic arg selection class IntrinArgSelectType; def IntrinArgSelect_Index : IntrinArgSelectType; -def IntrinArgSelect_I8 : IntrinArgSelectType; -def IntrinArgSelect_I32 : IntrinArgSelectType; +def IntrinArgSelect_I8 : IntrinArgSelectType; +def IntrinArgSelect_I32 : IntrinArgSelectType; class IntrinArgSelect { IntrinArgSelectType type = type_; int value = value_; } + class IntrinArgIndex : IntrinArgSelect; -class IntrinArgI8 : IntrinArgSelect; -class IntrinArgI32 : IntrinArgSelect; +class IntrinArgI8 : IntrinArgSelect; +class IntrinArgI32 : IntrinArgSelect; // Select which intrinsic to lower from for a DXILOp. 
// If the intrinsic is the only argument given to IntrinSelect, then the @@ -364,7 +365,8 @@ class IntrinArgI32 : IntrinArgSelect; // >, // ] // -class IntrinSelect arg_selects_=[]> { +class IntrinSelect arg_selects_ = []> { Intrinsic intrinsic = intrinsic_; list arg_selects = arg_selects_; } @@ -403,10 +405,12 @@ class DXILOp { } // Concrete definitions of DXIL Operations +// +// This are sorted by ascending value of the DXIL Opcodes -def Abs : DXILOp<6, unary> { +def Abs : DXILOp<6, unary> { let Doc = "Returns the absolute value of the input."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -414,9 +418,10 @@ def Abs : DXILOp<6, unary> { let attributes = [Attributes]; } -def Saturate : DXILOp<7, unary> { - let Doc = "Clamps a single or double precision floating point value to [0.0f...1.0f]."; - let intrinsics = [ IntrinSelect ]; +def Saturate : DXILOp<7, unary> { + let Doc = "Clamps a single or double precision floating point value to " + "[0.0f...1.0f]."; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -424,9 +429,9 @@ def Saturate : DXILOp<7, unary> { let attributes = [Attributes]; } -def IsInf : DXILOp<9, isSpecialFloat> { +def IsInf : DXILOp<9, isSpecialFloat> { let Doc = "Determines if the specified value is infinite."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = Int1Ty; let overloads = [Overloads]; @@ -434,9 +439,9 @@ def IsInf : DXILOp<9, isSpecialFloat> { let attributes = [Attributes]; } -def Cos : DXILOp<12, unary> { +def Cos : DXILOp<12, unary> { let Doc = "Returns cosine(theta) for theta in radians."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -444,9 +449,9 @@ def Cos : DXILOp<12, unary> 
{ let attributes = [Attributes]; } -def Sin : DXILOp<13, unary> { +def Sin : DXILOp<13, unary> { let Doc = "Returns sine(theta) for theta in radians."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -454,9 +459,9 @@ def Sin : DXILOp<13, unary> { let attributes = [Attributes]; } -def Tan : DXILOp<14, unary> { +def Tan : DXILOp<14, unary> { let Doc = "Returns tangent(theta) for theta in radians."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -464,9 +469,9 @@ def Tan : DXILOp<14, unary> { let attributes = [Attributes]; } -def ACos : DXILOp<15, unary> { +def ACos : DXILOp<15, unary> { let Doc = "Returns the arccosine of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -474,9 +479,9 @@ def ACos : DXILOp<15, unary> { let attributes = [Attributes]; } -def ASin : DXILOp<16, unary> { +def ASin : DXILOp<16, unary> { let Doc = "Returns the arcsine of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -484,9 +489,9 @@ def ASin : DXILOp<16, unary> { let attributes = [Attributes]; } -def ATan : DXILOp<17, unary> { +def ATan : DXILOp<17, unary> { let Doc = "Returns the arctangent of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -494,9 +499,9 @@ def ATan : DXILOp<17, unary> { let attributes = [Attributes]; } -def HCos : DXILOp<18, unary> { +def HCos : DXILOp<18, unary> { let Doc = "Returns the hyperbolic cosine of the specified value."; - let intrinsics = [ 
IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -504,9 +509,9 @@ def HCos : DXILOp<18, unary> { let attributes = [Attributes]; } -def HSin : DXILOp<19, unary> { +def HSin : DXILOp<19, unary> { let Doc = "Returns the hyperbolic sine of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -514,9 +519,9 @@ def HSin : DXILOp<19, unary> { let attributes = [Attributes]; } -def HTan : DXILOp<20, unary> { +def HTan : DXILOp<20, unary> { let Doc = "Returns the hyperbolic tan of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -524,10 +529,10 @@ def HTan : DXILOp<20, unary> { let attributes = [Attributes]; } -def Exp2 : DXILOp<21, unary> { +def Exp2 : DXILOp<21, unary> { let Doc = "Returns the base 2 exponential, or 2**x, of the specified value. 
" "exp2(x) = 2**x."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -535,10 +540,10 @@ def Exp2 : DXILOp<21, unary> { let attributes = [Attributes]; } -def Frac : DXILOp<22, unary> { +def Frac : DXILOp<22, unary> { let Doc = "Returns a fraction from 0 to 1 that represents the decimal part " "of the input."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -546,9 +551,9 @@ def Frac : DXILOp<22, unary> { let attributes = [Attributes]; } -def Log2 : DXILOp<23, unary> { +def Log2 : DXILOp<23, unary> { let Doc = "Returns the base-2 logarithm of the specified value."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -556,10 +561,10 @@ def Log2 : DXILOp<23, unary> { let attributes = [Attributes]; } -def Sqrt : DXILOp<24, unary> { +def Sqrt : DXILOp<24, unary> { let Doc = "Returns the square root of the specified floating-point value, " "per component."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -567,10 +572,10 @@ def Sqrt : DXILOp<24, unary> { let attributes = [Attributes]; } -def RSqrt : DXILOp<25, unary> { +def RSqrt : DXILOp<25, unary> { let Doc = "Returns the reciprocal of the square root of the specified value. 
" "rsqrt(x) = 1 / sqrt(x)."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -578,10 +583,10 @@ def RSqrt : DXILOp<25, unary> { let attributes = [Attributes]; } -def Round : DXILOp<26, unary> { +def Round : DXILOp<26, unary> { let Doc = "Returns the input rounded to the nearest integer within a " "floating-point type."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -589,10 +594,10 @@ def Round : DXILOp<26, unary> { let attributes = [Attributes]; } -def Floor : DXILOp<27, unary> { +def Floor : DXILOp<27, unary> { let Doc = "Returns the largest integer that is less than or equal to the input."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -600,10 +605,10 @@ def Floor : DXILOp<27, unary> { let attributes = [Attributes]; } -def Ceil : DXILOp<28, unary> { +def Ceil : DXILOp<28, unary> { let Doc = "Returns the smallest integer that is greater than or equal to the " "input."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -611,9 +616,9 @@ def Ceil : DXILOp<28, unary> { let attributes = [Attributes]; } -def Trunc : DXILOp<29, unary> { +def Trunc : DXILOp<29, unary> { let Doc = "Returns the specified value truncated to the integer component."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -621,23 +626,21 @@ def Trunc : DXILOp<29, unary> { let attributes = [Attributes]; } -def Rbits : DXILOp<30, unary> { +def Rbits : DXILOp<30, unary> { let Doc = "Returns the specified value with its bits reversed."; - let intrinsics = [ 
IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def CountBits : DXILOp<31, unaryBits> { +def CountBits : DXILOp<31, unaryBits> { let Doc = "Returns the number of 1 bits in the specified value."; let arguments = [OverloadTy]; let result = Int32Ty; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } @@ -645,145 +648,133 @@ def CountBits : DXILOp<31, unaryBits> { def FirstbitLo : DXILOp<32, unaryBits> { let Doc = "Returns the location of the first set bit starting from " "the lowest order bit and working upward."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = Int32Ty; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def FirstbitHi : DXILOp<33, unaryBits> { +def FirstbitHi : DXILOp<33, unaryBits> { let Doc = "Returns the location of the first set bit starting from " "the highest order bit and working downward."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = Int32Ty; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def FirstbitSHi : DXILOp<34, unaryBits> { +def FirstbitSHi : DXILOp<34, unaryBits> { let Doc = "Returns the location of the first set bit from " "the highest order bit based on the sign."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = Int32Ty; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def FMax : DXILOp<35, binary> { +def FMax : DXILOp<35, binary> { let Doc = "Float maximum. FMax(a,b) = a > b ? 
a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def FMin : DXILOp<36, binary> { +def FMin : DXILOp<36, binary> { let Doc = "Float minimum. FMin(a,b) = a < b ? a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def SMax : DXILOp<37, binary> { +def SMax : DXILOp<37, binary> { let Doc = "Signed integer maximum. SMax(a,b) = a > b ? a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def SMin : DXILOp<38, binary> { +def SMin : DXILOp<38, binary> { let Doc = "Signed integer minimum. SMin(a,b) = a < b ? a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def UMax : DXILOp<39, binary> { +def UMax : DXILOp<39, binary> { let Doc = "Unsigned integer maximum. UMax(a,b) = a > b ? a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def UMin : DXILOp<40, binary> { +def UMin : DXILOp<40, binary> { let Doc = "Unsigned integer minimum. UMin(a,b) = a < b ? 
a : b"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def FMad : DXILOp<46, tertiary> { +def FMad : DXILOp<46, tertiary> { let Doc = "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m " "* a + b."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def IMad : DXILOp<48, tertiary> { +def IMad : DXILOp<48, tertiary> { let Doc = "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m " "* a + b."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def UMad : DXILOp<49, tertiary> { +def UMad : DXILOp<49, tertiary> { let Doc = "Unsigned integer arithmetic multiply/add operation. umad(m,a, = m " "* a + b."; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy, OverloadTy, OverloadTy]; let result = OverloadTy; - let overloads = - [Overloads]; + let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def Dot2 : DXILOp<54, dot2> { +def Dot2 : DXILOp<54, dot2> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... 
+ " "a[n]*b[n] where n is 0 to 1 inclusive"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = !listsplat(OverloadTy, 4); let result = OverloadTy; let overloads = [Overloads]; @@ -791,10 +782,10 @@ def Dot2 : DXILOp<54, dot2> { let attributes = [Attributes]; } -def Dot3 : DXILOp<55, dot3> { +def Dot3 : DXILOp<55, dot3> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + " "a[n]*b[n] where n is 0 to 2 inclusive"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = !listsplat(OverloadTy, 6); let result = OverloadTy; let overloads = [Overloads]; @@ -802,10 +793,10 @@ def Dot3 : DXILOp<55, dot3> { let attributes = [Attributes]; } -def Dot4 : DXILOp<56, dot4> { +def Dot4 : DXILOp<56, dot4> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + " "a[n]*b[n] where n is 0 to 3 inclusive"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = !listsplat(OverloadTy, 8); let result = OverloadTy; let overloads = [Overloads]; @@ -866,17 +857,31 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> { let attributes = [Attributes]; } +def Barrier : DXILOp<80, barrier> { + let Doc = "inserts a memory barrier in the shader"; + let intrinsics = [ + IntrinSelect]>, + ]; + + let arguments = [Int32Ty]; + let result = VoidTy; + let stages = [Stages]; + let attributes = [Attributes]; + let properties = [IsBarrier]; +} + def Discard : DXILOp<82, discard> { let Doc = "discard the current pixel"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [Int1Ty]; let result = VoidTy; let stages = [Stages]; } -def ThreadId : DXILOp<93, threadId> { +def ThreadId : DXILOp<93, threadId> { let Doc = "Reads the thread ID"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -884,9 +889,9 @@ def 
ThreadId : DXILOp<93, threadId> { let attributes = [Attributes]; } -def GroupId : DXILOp<94, groupId> { +def GroupId : DXILOp<94, groupId> { let Doc = "Reads the group ID (SV_GroupID)"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -894,9 +899,9 @@ def GroupId : DXILOp<94, groupId> { let attributes = [Attributes]; } -def ThreadIdInGroup : DXILOp<95, threadIdInGroup> { +def ThreadIdInGroup : DXILOp<95, threadIdInGroup> { let Doc = "Reads the thread ID within the group (SV_GroupThreadID)"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = OverloadTy; let overloads = [Overloads]; @@ -904,26 +909,26 @@ def ThreadIdInGroup : DXILOp<95, threadIdInGroup> { let attributes = [Attributes]; } -def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> { +def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> { let Doc = "Provides a flattened index for a given thread within a given " "group (SV_GroupIndex)"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let result = OverloadTy; let overloads = [Overloads]; let stages = [Stages]; let attributes = [Attributes]; } -def MakeDouble : DXILOp<101, makeDouble> { +def MakeDouble : DXILOp<101, makeDouble> { let Doc = "creates a double value"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [Int32Ty, Int32Ty]; let result = DoubleTy; let stages = [Stages]; let attributes = [Attributes]; } -def SplitDouble : DXILOp<102, splitDouble> { +def SplitDouble : DXILOp<102, splitDouble> { let Doc = "Splits a double into 2 uints"; let arguments = [OverloadTy]; let result = SplitDoubleTy; @@ -932,6 +937,89 @@ def SplitDouble : DXILOp<102, splitDouble> { let attributes = [Attributes]; } +def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { + let Doc = "returns 1 for the first lane in the 
wave"; + let intrinsics = [IntrinSelect]; + let arguments = []; + let result = Int1Ty; + let stages = [Stages]; + let properties = [IsWave]; +} + +def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { + let Doc = "returns the index of the current lane in the wave"; + let intrinsics = [IntrinSelect]; + let arguments = []; + let result = Int32Ty; + let stages = [Stages]; + let attributes = [Attributes]; + let properties = [IsWave]; +} + +def WaveActiveAnyTrue : DXILOp<113, waveAnyTrue> { + let Doc = "returns true if the expression is true in any of the active lanes " + "in the current wave"; + let intrinsics = [IntrinSelect]; + let arguments = [Int1Ty]; + let result = Int1Ty; + let stages = [Stages]; + let properties = [IsWave]; +} + +def WaveActiveAllTrue : DXILOp<114, waveAllTrue> { + let Doc = "returns true if the expression is true in all of the active lanes " + "in the current wave"; + let intrinsics = [IntrinSelect]; + let arguments = [Int1Ty]; + let result = Int1Ty; + let stages = [Stages]; + let properties = [IsWave]; +} + +def WaveReadLaneAt : DXILOp<117, waveReadLaneAt> { + let Doc = "returns the value from the specified lane"; + let intrinsics = [IntrinSelect]; + let arguments = [OverloadTy, Int32Ty]; + let result = OverloadTy; + let overloads = [Overloads< + DXIL1_0, [HalfTy, FloatTy, DoubleTy, Int1Ty, Int16Ty, Int32Ty, Int64Ty]>]; + let stages = [Stages]; + let properties = [IsWave]; +} + +def WaveActiveOp : DXILOp<119, waveActiveOp> { + let Doc = "returns the result of the operation across waves"; + let intrinsics = [ + IntrinSelect, IntrinArgI8, + IntrinArgI8 + ]>, + IntrinSelect, IntrinArgI8, + IntrinArgI8 + ]>, + ]; + + let arguments = [OverloadTy, Int8Ty, Int8Ty]; + let result = OverloadTy; + let overloads = [ + Overloads + ]; + let stages = [Stages]; + let attributes = [Attributes]; +} + +def WaveAllBitCount : DXILOp<135, waveAllOp> { + let Doc = "returns the count of bits set to 1 across the wave"; + let intrinsics = [IntrinSelect]; + let 
arguments = [Int1Ty]; + let result = Int32Ty; + let stages = [Stages]; + let properties = [IsWave]; +} + def RawBufferLoad : DXILOp<139, rawBufferLoad> { let Doc = "reads from a raw buffer and structured buffer"; // Handle, Coord0, Coord1, Mask, Alignment @@ -972,7 +1060,7 @@ def RawBufferStore : DXILOp<140, rawBufferStore> { def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [Int32Ty, Int32Ty, Int32Ty]; let result = Int32Ty; let stages = [Stages]; @@ -982,7 +1070,7 @@ def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { def Dot4AddU8Packed : DXILOp<164, dot4AddPacked> { let Doc = "unsigned dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; - let intrinsics = [ IntrinSelect ]; + let intrinsics = [IntrinSelect]; let arguments = [Int32Ty, Int32Ty, Int32Ty]; let result = Int32Ty; let stages = [Stages]; @@ -1004,92 +1092,3 @@ def CreateHandleFromBinding : DXILOp<217, createHandleFromBinding> { let stages = [Stages]; let attributes = [Attributes]; } - -def WaveActiveAllTrue : DXILOp<114, waveAllTrue> { - let Doc = "returns true if the expression is true in all of the active lanes in the current wave"; - let intrinsics = [ IntrinSelect ]; - let arguments = [Int1Ty]; - let result = Int1Ty; - let stages = [Stages]; - let properties = [IsWave]; -} - -def WaveActiveAnyTrue : DXILOp<113, waveAnyTrue> { - let Doc = "returns true if the expression is true in any of the active lanes in the current wave"; - let intrinsics = [ IntrinSelect ]; - let arguments = [Int1Ty]; - let result = Int1Ty; - let stages = [Stages]; - let properties = [IsWave]; -} - -def WaveActiveOp : DXILOp<119, waveActiveOp> { - let Doc = "returns the result of the operation across waves"; - let intrinsics = [ - IntrinSelect< - int_dx_wave_reduce_sum, - [ IntrinArgIndex<0>, IntrinArgI8, IntrinArgI8 ]>, - 
IntrinSelect< - int_dx_wave_reduce_usum, - [ IntrinArgIndex<0>, IntrinArgI8, IntrinArgI8 ]>, - ]; - - let arguments = [OverloadTy, Int8Ty, Int8Ty]; - let result = OverloadTy; - let overloads = [Overloads]; - let stages = [Stages]; - let attributes = [Attributes]; -} - -def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { - let Doc = "returns 1 for the first lane in the wave"; - let intrinsics = [ IntrinSelect ]; - let arguments = []; - let result = Int1Ty; - let stages = [Stages]; - let properties = [IsWave]; -} - -def WaveReadLaneAt: DXILOp<117, waveReadLaneAt> { - let Doc = "returns the value from the specified lane"; - let intrinsics = [ IntrinSelect ]; - let arguments = [OverloadTy, Int32Ty]; - let result = OverloadTy; - let overloads = [Overloads]; - let stages = [Stages]; - let properties = [IsWave]; -} - -def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { - let Doc = "returns the index of the current lane in the wave"; - let intrinsics = [ IntrinSelect ]; - let arguments = []; - let result = Int32Ty; - let stages = [Stages]; - let attributes = [Attributes]; - let properties = [IsWave]; -} - -def WaveAllBitCount : DXILOp<135, waveAllOp> { - let Doc = "returns the count of bits set to 1 across the wave"; - let intrinsics = [ IntrinSelect ]; - let arguments = [Int1Ty]; - let result = Int32Ty; - let stages = [Stages]; - let properties = [IsWave]; -} - -def Barrier : DXILOp<80, barrier> { - let Doc = "inserts a memory barrier in the shader"; - let intrinsics = [ - IntrinSelect< - int_dx_group_memory_barrier_with_group_sync, - [ IntrinArgI32 ]>, - ]; - - let arguments = [Int32Ty]; - let result = VoidTy; - let stages = [Stages]; - let attributes = [Attributes]; - let properties = [IsBarrier]; -} From 4bd0440bd2a653644987dddf8ec8d9d1f258ce31 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 23 Jan 2025 08:20:50 -0800 Subject: [PATCH 157/208] [RISCV] Add RVVConstraint to SiFive custom matrix multiply instructions. 
(#124055) The instructions don't allow the vs1 encoded register to overlap vd. Confusingly these instructions order their operands vd, vs1, vs2 while every other vector instruction is vd, vs2, vs1. So we need to use VS2Constraint for this since it checks the first operand after vd. 2 of the 3 extensions have instruction that produce a result with EMUL=2*LMUL. This makes them subject to the widening constraints for vs2. So for these extensions we use WidenV which includes VS2Constraint. --- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 6 +-- llvm/test/MC/RISCV/rvv/xsfvfwmacc-invalid.s | 10 +++++ llvm/test/MC/RISCV/rvv/xsfvqmacc-invalid.s | 50 +++++++++++++++++++++ 3 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/RISCV/rvv/xsfvfwmacc-invalid.s create mode 100644 llvm/test/MC/RISCV/rvv/xsfvqmacc-invalid.s diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 20adda91f6bde1..d5105a9edb0290 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -202,7 +202,7 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, } let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod", - DestEEW = EEWSEWx4 in { + DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in { def VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; def VQMACC_2x8x2 : CustomSiFiveVMACC<0b101101, OPMVV, "sf.vqmacc.2x8x2">; def VQMACCUS_2x8x2 : CustomSiFiveVMACC<0b101110, OPMVV, "sf.vqmaccus.2x8x2">; @@ -210,7 +210,7 @@ let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod", } let Predicates = [HasVendorXSfvqmaccqoq], DecoderNamespace = "XSfvqmaccqoq", - DestEEW = EEWSEWx4 in { + DestEEW = EEWSEWx4, RVVConstraint=WidenV in { def VQMACCU_4x8x4 : CustomSiFiveVMACC<0b111100, OPMVV, "sf.vqmaccu.4x8x4">; def VQMACC_4x8x4 : CustomSiFiveVMACC<0b111101, OPMVV, "sf.vqmacc.4x8x4">; def VQMACCUS_4x8x4 : 
CustomSiFiveVMACC<0b111110, OPMVV, "sf.vqmaccus.4x8x4">; @@ -218,7 +218,7 @@ let Predicates = [HasVendorXSfvqmaccqoq], DecoderNamespace = "XSfvqmaccqoq", } let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq", - DestEEW = EEWSEWx2 in { + DestEEW = EEWSEWx2, RVVConstraint=WidenV in { def VFWMACC_4x4x4 : CustomSiFiveVMACC<0b111100, OPFVV, "sf.vfwmacc.4x4x4">; } diff --git a/llvm/test/MC/RISCV/rvv/xsfvfwmacc-invalid.s b/llvm/test/MC/RISCV/rvv/xsfvfwmacc-invalid.s new file mode 100644 index 00000000000000..452778613adf67 --- /dev/null +++ b/llvm/test/MC/RISCV/rvv/xsfvfwmacc-invalid.s @@ -0,0 +1,10 @@ +# RUN: not llvm-mc -triple=riscv64 -show-encoding -mattr=+v,+xsfvfwmaccqqq %s 2>&1 \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ERROR + +sf.vfwmacc.4x4x4 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vfwmacc.4x4x4 v8, v8, v20{{$}} + +sf.vfwmacc.4x4x4 v8, v4, v8 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vfwmacc.4x4x4 v8, v4, v8{{$}} diff --git a/llvm/test/MC/RISCV/rvv/xsfvqmacc-invalid.s b/llvm/test/MC/RISCV/rvv/xsfvqmacc-invalid.s new file mode 100644 index 00000000000000..a7cbfbcb1f3654 --- /dev/null +++ b/llvm/test/MC/RISCV/rvv/xsfvqmacc-invalid.s @@ -0,0 +1,50 @@ +# RUN: not llvm-mc -triple=riscv64 -show-encoding -mattr=+v,+xsfvqmaccqoq,+xsfvqmaccdod %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR + +sf.vqmaccu.2x8x2 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccu.2x8x2 v8, v8, v20 + +sf.vqmacc.2x8x2 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmacc.2x8x2 v8, v8, v20 + +sf.vqmaccus.2x8x2 v8, v8, v20 +# CHECK-ERROR: the destination vector 
register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccus.2x8x2 v8, v8, v20 + +sf.vqmaccsu.2x8x2 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccsu.2x8x2 v8, v8, v20 + +sf.vqmaccu.4x8x4 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccu.4x8x4 v8, v8, v20 + +sf.vqmacc.4x8x4 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmacc.4x8x4 v8, v8, v20 + +sf.vqmaccus.4x8x4 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccus.4x8x4 v8, v8, v20 + +sf.vqmaccsu.4x8x4 v8, v8, v20 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccsu.4x8x4 v8, v8, v20 + +sf.vqmaccu.4x8x4 v8, v4, v8 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccu.4x8x4 v8, v4, v8 + +sf.vqmacc.4x8x4 v8, v4, v8 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmacc.4x8x4 v8, v4, v8 + +sf.vqmaccus.4x8x4 v8, v4, v8 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccus.4x8x4 v8, v4, v8 + +sf.vqmaccsu.4x8x4 v8, v4, v8 +# CHECK-ERROR: the destination vector register group cannot overlap the source vector register group{{$}} +# CHECK-ERROR-LABEL: sf.vqmaccsu.4x8x4 v8, v4, v8 From 1937a36209bc5f3636e7c98a1638ee9f082b4d2b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 23 Jan 2025 08:26:39 -0800 Subject: [PATCH 158/208] [RISCV] Add 
@earlyclobber to SiFive custom matrix multiply instruction. (#124060) All of these have a constraint that vd and vs1 cannot overlap. Some of them have an additional widening constraint for vs2. We should use earlyclobber to protect this. This is unlikely to be an issue in practice due to the instrinsic being ternary so vd is also a source. The intrinsic has a different type for this source than the other sources. You would have to do something crazy to get the register allocator to overlap the registers. --- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index d5105a9edb0290..0654f1ac19a82c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -374,28 +374,28 @@ multiclass VPseudoVC_XVW { +multiclass VPseudoSiFiveVMACC { def "Pseudo" # NAME # "_" # mx - : VPseudoTernaryNoMaskWithPolicy; + : VPseudoTernaryNoMaskWithPolicy; } -multiclass VPseudoSiFiveVQMACCDOD { +multiclass VPseudoSiFiveVQMACCDOD { foreach m = MxListVF8 in let VLMul = m.value in - defm NAME : VPseudoSiFiveVMACC; + defm NAME : VPseudoSiFiveVMACC; } -multiclass VPseudoSiFiveVQMACCQOQ { +multiclass VPseudoSiFiveVQMACCQOQ { foreach m = [V_MF2, V_M1, V_M2, V_M4] in let VLMul = m.value in - defm NAME : VPseudoSiFiveVMACC; + defm NAME : VPseudoSiFiveVMACC; } -multiclass VPseudoSiFiveVFWMACC { +multiclass VPseudoSiFiveVFWMACC { foreach m = MxListVF2 in let VLMul = m.value in - defm NAME : VPseudoSiFiveVMACC; + defm NAME : VPseudoSiFiveVMACC; } multiclass VPseudoSiFiveVFNRCLIP { From df299958e64c73d73b427afc70b960ec039586ac Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:31:18 -0800 Subject: [PATCH 159/208] [mlir] Fix warnings This patch fixes: mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp:403:5: error: 'ClampRange' may not intend to support class template argument 
deduction [-Werror,-Wctad-maybe-unsupported] mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp:404:5: error: 'ClampRange' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported] --- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index 8b883487d1659b..b8e0005dc1bc03 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -400,8 +400,8 @@ struct ClampClampOptimization : public OpRewritePattern { const auto opMaxFloat = op.getMaxFp(); const auto clampOpMinFloat = clampOp.getMinFp(); const auto clampOpMaxFloat = clampOp.getMaxFp(); - ClampRange opRangeFloatRange(opMinFloat, opMaxFloat); - ClampRange clampRangeFloatRange(clampOpMinFloat, clampOpMaxFloat); + ClampRange opRangeFloatRange(opMinFloat, opMaxFloat); + ClampRange clampRangeFloatRange(clampOpMinFloat, clampOpMaxFloat); if (!opRangeFloatRange.intersects(clampRangeFloatRange)) return failure(); From bca6dbd3a241f4a2cb6cfa5ed4c2f94cf76d3f17 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 Jan 2025 17:34:34 +0100 Subject: [PATCH 160/208] [X86] Add additional i128 abi test (NFC) --- llvm/test/CodeGen/X86/i128-abi.ll | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/test/CodeGen/X86/i128-abi.ll b/llvm/test/CodeGen/X86/i128-abi.ll index d1d6f86e08fb8b..23eb6ec0322abd 100644 --- a/llvm/test/CodeGen/X86/i128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-abi.ll @@ -19,6 +19,15 @@ define i128 @on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5) { ret i128 %a5 } +define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i128 %a6) { +; CHECK-LABEL: on_stack2: +; CHECK: # %bb.0: +; CHECK-NEXT: movq 24(%rsp), %rax +; CHECK-NEXT: movq 32(%rsp), %rdx +; CHECK-NEXT: retq + ret i128 %a6 +} + define 
i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) { ; CHECK-LABEL: trailing_arg_on_stack: ; CHECK: # %bb.0: From 7db4ba3916d33e57fb5244214f4873bf74e273f0 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 23 Jan 2025 11:36:53 -0500 Subject: [PATCH 161/208] [GlobalMerge][NFC] Fix inaccurate comments (#124136) I was studying the code here and realized that the comments were talking about grouping by basic blocks when the code was grouping by Function. Fix the comments so they reflect what the code is actually doing. --- llvm/lib/CodeGen/GlobalMerge.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 48d4d7848d84a7..9d4547df046d46 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -378,7 +378,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, size_t UGSIdx = GlobalUsesByFunction[ParentFn]; - // If this is the first global the basic block uses, map it to the set + // If this is the first global the function uses, map it to the set // consisting of this global only. if (!UGSIdx) { // If that set doesn't exist yet, create it. @@ -393,7 +393,8 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, continue; } - // If we already encountered this BB, just increment the counter. + // If we already encountered a use of this global in this function, just + // increment the counter. if (UsedGlobalSets[UGSIdx].Globals.test(GI)) { ++UsedGlobalSets[UGSIdx].UsageCount; continue; @@ -423,7 +424,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, } // Now we found a bunch of sets of globals used together. We accumulated - // the number of times we encountered the sets (i.e., the number of blocks + // the number of times we encountered the sets (i.e., the number of functions // that use that exact set of globals). 
// // Multiply that by the size of the set to give us a crude profitability From fa299294c068b1857d8d7ee74a512080898f194d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 23 Jan 2025 08:18:09 -0800 Subject: [PATCH 162/208] [SLP][NFC]Modernize code base in several places --- .../Transforms/Vectorize/SLPVectorizer.cpp | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fc6bba6d2b8b3b..a733c3a02bbc88 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4142,8 +4142,9 @@ class BoUpSLP { // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { // Need to search for the lane since the tree entry can be reordered. + auto *In = BundleMember->Inst; int Lane = std::distance(TE->Scalars.begin(), - find(TE->Scalars, BundleMember->Inst)); + find(TE->Scalars, In)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -4152,16 +4153,13 @@ class BoUpSLP { // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. 
- auto *In = BundleMember->Inst; assert( In && (isa(In) || In->getNumOperands() == TE->getNumOperands()) && "Missed TreeEntry operands?"); - (void)In; // fake use to avoid build failure when assertions disabled - for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); - OpIdx != NumOperands; ++OpIdx) + for (unsigned OpIdx : seq(TE->getNumOperands())) if (auto *I = dyn_cast(TE->getOperand(OpIdx)[Lane])) DecrUnsched(I); } else { @@ -9093,7 +9091,7 @@ static bool isAlternateInstruction(const Instruction *I, if (auto *MainCI = dyn_cast(MainOp)) { auto *AltCI = cast(AltOp); CmpInst::Predicate MainP = MainCI->getPredicate(); - CmpInst::Predicate AltP = AltCI->getPredicate(); + [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate(); assert(MainP != AltP && "Expected different main/alternate predicates."); auto *CI = cast(I); if (isCmpSameOrSwapped(MainCI, CI, TLI)) @@ -9106,7 +9104,6 @@ static bool isAlternateInstruction(const Instruction *I, assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && "CmpInst expected to match either main or alternate predicate or " "their swap."); - (void)AltP; return MainP != P && MainP != SwappedP; } return I->getOpcode() == AltOp->getOpcode(); @@ -12379,8 +12376,7 @@ static T *performExtractsShuffleAction( else Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; } - auto *V = ValueSelect::get(Base); - (void)V; + [[maybe_unused]] auto *V = ValueSelect::get(Base); assert((!V || GetVF(V) == Mask.size()) && "Expected base vector of VF number of elements."); Prev = Action(Mask, {nullptr, Res.first}); @@ -12431,8 +12427,7 @@ static T *performExtractsShuffleAction( } VMIt = std::next(VMIt); } - bool IsBaseNotUndef = !IsBaseUndef.all(); - (void)IsBaseNotUndef; + [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all(); // Perform requested actions for the remaining masks/vectors. for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { // Shuffle other input vectors, if any. 
@@ -17626,8 +17621,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { if (ScheduleData *SD = BS->getScheduleData(I)) { - TreeEntry *SDTE = getTreeEntry(SD->Inst); - (void)SDTE; + [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst); assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->isPartOfBundle() == (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && From e622468f164f6ba223e6862d8235eea5f555f927 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:45:02 -0800 Subject: [PATCH 163/208] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124074) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect CO to be nonnull. 
--- clang/lib/AST/JSONNodeDumper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index ddbe2136a671f3..36ef1fc8c79db0 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -1537,9 +1537,9 @@ void JSONNodeDumper::VisitExprWithCleanups(const ExprWithCleanups *EWC) { if (EWC->getNumObjects()) { JOS.attributeArray("cleanups", [this, EWC] { for (const ExprWithCleanups::CleanupObject &CO : EWC->getObjects()) - if (auto *BD = CO.dyn_cast()) { + if (auto *BD = dyn_cast(CO)) { JOS.value(createBareDeclRef(BD)); - } else if (auto *CLE = CO.dyn_cast()) { + } else if (auto *CLE = dyn_cast(CO)) { llvm::json::Object Obj; Obj["id"] = createPointerRepresentation(CLE); Obj["kind"] = CLE->getStmtClassName(); From 113e1fdc8c7f9085d5a48ca16b270cf53e9f189d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:45:26 -0800 Subject: [PATCH 164/208] [CodeGen] Migrate away from PointerUnion::dyn_cast (NFC) (#124076) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Pos to be nonnull. 
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index ddcb04d53661d0..5ae3fe694d0e6e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4079,7 +4079,7 @@ static void emitDependData(CodeGenFunction &CGF, QualType &KmpDependInfoTy, Size = llvm::ConstantInt::get(CGF.SizeTy, 0); } LValue Base; - if (unsigned *P = Pos.dyn_cast()) { + if (unsigned *P = dyn_cast(Pos)) { Base = CGF.MakeAddrLValue( CGF.Builder.CreateConstGEP(DependenciesArray, *P), KmpDependInfoTy); } else { @@ -4109,7 +4109,7 @@ static void emitDependData(CodeGenFunction &CGF, QualType &KmpDependInfoTy, CGF.EmitStoreOfScalar( llvm::ConstantInt::get(LLVMFlagsTy, static_cast(DepKind)), FlagsLVal); - if (unsigned *P = Pos.dyn_cast()) { + if (unsigned *P = dyn_cast(Pos)) { ++(*P); } else { LValue &PosLVal = *cast(Pos); From d05008363d4ed87b1350701831032ea5070d5b98 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:45:59 -0800 Subject: [PATCH 165/208] [lldb] Avoid repeated map lookups (NFC) (#124077) --- lldb/source/Target/DynamicRegisterInfo.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lldb/source/Target/DynamicRegisterInfo.cpp b/lldb/source/Target/DynamicRegisterInfo.cpp index 1a817449fa9589..9ad98a41c688c8 100644 --- a/lldb/source/Target/DynamicRegisterInfo.cpp +++ b/lldb/source/Target/DynamicRegisterInfo.cpp @@ -460,8 +460,8 @@ void DynamicRegisterInfo::Finalize(const ArchSpec &arch) { // Now update all value_regs with each register info as needed const size_t num_regs = m_regs.size(); for (size_t i = 0; i < num_regs; ++i) { - if (m_value_regs_map.find(i) != m_value_regs_map.end()) - m_regs[i].value_regs = m_value_regs_map[i].data(); + if (auto it = m_value_regs_map.find(i); it != m_value_regs_map.end()) + m_regs[i].value_regs = it->second.data(); else 
m_regs[i].value_regs = nullptr; } @@ -509,8 +509,9 @@ void DynamicRegisterInfo::Finalize(const ArchSpec &arch) { // Now update all invalidate_regs with each register info as needed for (size_t i = 0; i < num_regs; ++i) { - if (m_invalidate_regs_map.find(i) != m_invalidate_regs_map.end()) - m_regs[i].invalidate_regs = m_invalidate_regs_map[i].data(); + if (auto it = m_invalidate_regs_map.find(i); + it != m_invalidate_regs_map.end()) + m_regs[i].invalidate_regs = it->second.data(); else m_regs[i].invalidate_regs = nullptr; } From bb019dd165ceeb5b9c9e4a0bf3c9ee9bc886e7fc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:46:19 -0800 Subject: [PATCH 166/208] [CodeGen] Avoid repeated hash lookups (NFC) (#124078) --- llvm/lib/CodeGen/MachineLoopUtils.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp index 0e8335d4974d72..e869eed2ea1b16 100644 --- a/llvm/lib/CodeGen/MachineLoopUtils.cpp +++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp @@ -76,8 +76,9 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I) for (MachineOperand &MO : I->uses()) - if (MO.isReg() && Remaps.count(MO.getReg())) - MO.setReg(Remaps[MO.getReg()]); + if (MO.isReg()) + if (auto It = Remaps.find(MO.getReg()); It != Remaps.end()) + MO.setReg(It->second); for (auto I = NewBB->begin(); I->isPHI(); ++I) { MachineInstr &MI = *I; @@ -90,8 +91,8 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, // When peeling front, we are only left with the initial value from the // preheader. 
Register R = MI.getOperand(LoopRegIdx).getReg(); - if (Remaps.count(R)) - R = Remaps[R]; + if (auto It = Remaps.find(R); It != Remaps.end()) + R = It->second; OrigPhi.getOperand(InitRegIdx).setReg(R); MI.removeOperand(LoopRegIdx + 1); MI.removeOperand(LoopRegIdx + 0); From bda39a6067833c9353adbc42bddb1b5808bcf44b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Jan 2025 08:46:47 -0800 Subject: [PATCH 167/208] [ExecutionEngine] Include (#124083) This patch reinstates an include of , fixing a build failure caused by: commit 1f4d91ecb8529678a3d3919d7523743bd21942ca Author: Kazu Hirata Date: Tue Nov 19 19:41:59 2024 -0800 [ExecutionEngine] Remove unused includes (NFC) (#116749) --------- Co-authored-by: h-vetinari --- llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp index fb7cf94fa0654a..5a4698f0fa68d5 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp @@ -15,6 +15,7 @@ #if LLVM_USE_INTEL_JITEVENTS #include "IntelJITEventsWrapper.h" #include "ittnotify.h" +#include using namespace llvm; using namespace llvm::orc; From 7ddeea359811ec49a07db948bbf3f6b6c915f675 Mon Sep 17 00:00:00 2001 From: Acim Maravic Date: Thu, 23 Jan 2025 17:55:00 +0100 Subject: [PATCH 168/208] [LLVM][AMDGPU] MC support for ds_bpermute_fi_b32 (#124108) Added assembler/disassembler support for ds_bpermute_fi_b32 instruction, as well as tests. 
--- llvm/lib/Target/AMDGPU/DSInstructions.td | 2 ++ llvm/test/MC/AMDGPU/gfx12_asm_ds.s | 12 ++++++++++++ llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 9 +++++++++ 3 files changed, 23 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index ef618727258cf2..bc217e10e0fbd7 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -699,6 +699,7 @@ def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", int_amdgcn_ds_permute>; def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", int_amdgcn_ds_bpermute>; +def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32">; } } // let SubtargetPredicate = isGFX8Plus @@ -1264,6 +1265,7 @@ defm DS_PK_ADD_F16 : DS_Real_gfx12<0x09a>; defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>; defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; +defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>; // New aliases added in GFX12 without renaming the instructions. 
let AssemblerPredicate = isGFX12Plus in { diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index a0e6a3a613555a..34c42affdd46cc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -1910,3 +1910,15 @@ ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip") // GFX12: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08] + +ds_bpermute_fi_b32 v5, v1, v2 +// GFX12: encoding: [0x00,0x00,0x34,0xdb,0x01,0x02,0x00,0x05] + +ds_bpermute_fi_b32 v5, v1, v2 offset:65535 +// GFX12: encoding: [0xff,0xff,0x34,0xdb,0x01,0x02,0x00,0x05] + +ds_bpermute_fi_b32 v5, v1, v2 offset:0 +// GFX12: encoding: [0x00,0x00,0x34,0xdb,0x01,0x02,0x00,0x05] + +ds_bpermute_fi_b32 v255, v255, v255 offset:4 +// GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index 080a4cab2a319d..d66748135ffd42 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -3233,3 +3233,12 @@ # GFX12: ds_xor_rtn_b64 v[5:6], v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xac,0xd9,0xff,0x02,0x00,0x05] 0xff,0xff,0xac,0xd9,0xff,0x02,0x00,0x05 + +# GFX12: ds_bpermute_fi_b32 v5, v1, v2 ; encoding: [0x00,0x00,0x34,0xdb,0x01,0x02,0x00,0x05] +0x00,0x00,0x34,0xdb,0x01,0x02,0x00,0x05 + +# GFX12: ds_bpermute_fi_b32 v5, v1, v2 offset:65535 ; encoding: [0xff,0xff,0x34,0xdb,0x01,0x02,0x00,0x05] +0xff,0xff,0x34,0xdb,0x01,0x02,0x00,0x05 + +# GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] +0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff From d8cd8d56ea980d9a9c1e70bcc2dd7207d1236f94 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Jan 2025 16:57:13 +0000 Subject: [PATCH 169/208] [SLP] getSpillCost - fully populate IntrinsicCostAttributes to improve cost analysis. 
(#124129) We were only constructing the IntrinsicCostAttributes with the arg type info, and not the args themselves, preventing more detailed cost analysis (constant / uniform args etc.) Just pass the whole IntrinsicInst to the constructor and let it resolve everything it can. Noticed while having yet another attempt at #63980 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 14 ++--- .../SLPVectorizer/AArch64/loadorder.ll | 33 +++++----- .../SLPVectorizer/AArch64/reduce-fadd.ll | 28 ++++----- .../SLPVectorizer/AMDGPU/min_max.ll | 8 +-- .../SLPVectorizer/RISCV/complex-loads.ll | 62 +++++++++++-------- 5 files changed, 72 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a733c3a02bbc88..4de632d4ef149d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12253,18 +12253,12 @@ InstructionCost BoUpSLP::getSpillCost() const { if (auto *II = dyn_cast(I)) { if (II->isAssumeLikeIntrinsic()) return true; - FastMathFlags FMF; - SmallVector Tys; - for (auto &ArgOp : II->args()) - Tys.push_back(ArgOp->getType()); - if (auto *FPMO = dyn_cast(II)) - FMF = FPMO->getFastMathFlags(); - IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, - FMF); + IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II); InstructionCost IntrCost = TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput); - InstructionCost CallCost = TTI->getCallInstrCost( - nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput); + InstructionCost CallCost = + TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(), + TTI::TCK_RecipThroughput); if (IntrCost < CallCost) return true; } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 9ce79e5ea356b9..5ad676537f9c45 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[MUL]], 2 ; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 ; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 ; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = 
getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24 @@ -715,25 +715,22 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28 ; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44 -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36 ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 ; CHECK-NEXT: 
[[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index 00a4417ba7aff2..6576cbe075b740 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -716,29 +716,29 @@ define float @reduce_float_case3(ptr %a) { ; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 5 ; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 6 ; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds float, ptr [[A]], i32 7 -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[LOAD3:%.*]] = load float, 
ptr [[GEP3]], align 4 -; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4 -; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP5]], align 4 -; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP6]], align 4 -; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP7]], align 4 -; CHECK-NEXT: [[LOG:%.*]] = call float @llvm.log.f32(float [[LOAD]]) -; CHECK-NEXT: [[LOG1:%.*]] = call float @llvm.log.f32(float [[LOAD1]]) +; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP4]], align 4 +; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP5]], align 4 +; CHECK-NEXT: [[LOAD8:%.*]] = load float, ptr [[GEP6]], align 4 +; CHECK-NEXT: [[LOAD9:%.*]] = load float, ptr [[GEP7]], align 4 ; CHECK-NEXT: [[LOG2:%.*]] = call float @llvm.log.f32(float [[LOAD2]]) ; CHECK-NEXT: [[LOG3:%.*]] = call float @llvm.log.f32(float [[LOAD3]]) ; CHECK-NEXT: [[LOG4:%.*]] = call float @llvm.log.f32(float [[LOAD4]]) ; CHECK-NEXT: [[LOG5:%.*]] = call float @llvm.log.f32(float [[LOAD5]]) ; CHECK-NEXT: [[LOG6:%.*]] = call float @llvm.log.f32(float [[LOAD6]]) ; CHECK-NEXT: [[LOG7:%.*]] = call float @llvm.log.f32(float [[LOAD7]]) -; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOG]], [[LOG1]] -; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[ADD1]], [[LOG2]] -; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[ADD2]], [[LOG3]] +; CHECK-NEXT: [[LOG8:%.*]] = call float @llvm.log.f32(float [[LOAD8]]) +; CHECK-NEXT: [[LOG9:%.*]] = call float @llvm.log.f32(float [[LOAD9]]) +; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[LOG2]], [[LOG3]] ; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[ADD3]], [[LOG4]] ; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD4]], [[LOG5]] ; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[ADD5]], [[LOG6]] -; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ADD6]], [[LOG7]] +; 
CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD6]], [[LOG7]] +; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[ADD8]], [[LOG8]] +; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ADD9]], [[LOG9]] ; CHECK-NEXT: ret float [[ADD7]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll index 46c6c10125b95f..a3be8f5e935c9e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll @@ -358,12 +358,12 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP2]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 257e4660c80aab..11fa3337544a1a 100644 
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -28,13 +28,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> @@ -50,7 +46,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] @@ -64,8 +59,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] ; CHECK-NEXT: [[TMP44:%.*]] 
= sub <2 x i32> [[TMP30]], [[TMP42]] ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]] +; CHECK-NEXT: [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 +; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]] ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 ; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]] @@ -120,15 +115,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] ; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]] ; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]] -; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15 -; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15 -; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 -; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15 +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 ; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15 @@ -244,10 +231,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]] ; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] ; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; 
CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]] +; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] ; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] @@ -255,21 +242,42 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] -; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] -; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32> +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0 +; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0 +; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]] +; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]] +; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> +; CHECK-NEXT: [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15) +; CHECK-NEXT: [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537) +; 
CHECK-NEXT: [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535) +; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0 +; CHECK-NEXT: [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0 +; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]] +; CHECK-NEXT: [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]] +; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> +; CHECK-NEXT: [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1 +; CHECK-NEXT: [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1 ; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]] +; CHECK-NEXT: [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0 +; CHECK-NEXT: [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0 +; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> ; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]] +; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]] +; CHECK-NEXT: [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]] ; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] ; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] ; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], 
[[SUB106_1]] ; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]] +; CHECK-NEXT: [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0 ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1 ; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] ; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]] From 2f76e2b27d9ddd4fa0a1098f77b96fa51905bdb1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Jan 2025 11:59:57 -0500 Subject: [PATCH 170/208] [Driver] -fno-plt: warn for unsupported targets -fno-plt is an ELF specific option that is only implemented for x86 (for a long time) and AArch64 (#78890). GCC doesn't bother to give a diagnostic on Windows. -fno-plt is somewhat popular and we've been ignoring it for unsupported targets for a while, so just report a warning for unsupported targets. Pull Request: https://github.com/llvm/llvm-project/pull/124081 --- clang/lib/Driver/ToolChains/Clang.cpp | 5 ++--- clang/test/Driver/fno-plt.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/fno-plt.c diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 33f08cf28feca1..518113e20cb063 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6141,9 +6141,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fno-direct-access-external-data"); } - if (Args.hasFlag(options::OPT_fno_plt, options::OPT_fplt, false)) { - CmdArgs.push_back("-fno-plt"); - } + if (Triple.isOSBinFormatELF() && (Triple.isAArch64() || Triple.isX86())) + Args.addOptOutFlag(CmdArgs, options::OPT_fplt, options::OPT_fno_plt); // -fhosted is default. 
// TODO: Audit uses of KernelOrKext and see where it'd be more appropriate to diff --git a/clang/test/Driver/fno-plt.c b/clang/test/Driver/fno-plt.c new file mode 100644 index 00000000000000..c7bd7130593be0 --- /dev/null +++ b/clang/test/Driver/fno-plt.c @@ -0,0 +1,10 @@ +// RUN: %clang -### -c --target=aarch64 -fno-plt -Werror %s 2>&1 | FileCheck %s --check-prefix=NOPLT +// RUN: %clang -### -c --target=x86_64 -fno-plt -Werror %s 2>&1 | FileCheck %s --check-prefix=NOPLT + +// RUN: %clang -### -c --target=aarch64 -fno-plt -fplt -Werror %s 2>&1 | FileCheck %s --check-prefix=DEFAULT +// RUN: %clang -### -c --target=powerpc64 -fno-plt %s 2>&1 | FileCheck %s --check-prefixes=WARN,DEFAULT +// RUN: %clang -### -c --target=aarch64-windows -fno-plt %s 2>&1 | FileCheck %s --check-prefixes=WARN,DEFAULT + +// WARN: warning: argument unused during compilation: '-fno-plt' [-Wunused-command-line-argument] +// NOPLT: "-fno-plt" +// DEFAULT-NOT: "-fno-plt" From 082b148041ec8bb5024246da3a33e8246dc5e01b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 23 Jan 2025 09:00:54 -0800 Subject: [PATCH 171/208] [TableGen] Pass CodeGenProcModel reference instead of index to addWriteRes/addReadAdvance. NFC 2 of the 3 callers of each of these already had a reference they converted to index. Use that reference and make the one caller that only has an index responsible for looking up the reference from it. --- .../utils/TableGen/Common/CodeGenSchedule.cpp | 28 +++++++++---------- llvm/utils/TableGen/Common/CodeGenSchedule.h | 4 +-- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 2a42262f865cb9..8919a278f352bf 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -1849,21 +1849,21 @@ void CodeGenSchedModels::collectProcResources() { // Add resources separately defined by each subtarget. 
for (const Record *WR : Records.getAllDerivedDefinitions("WriteRes")) { const Record *ModelDef = WR->getValueAsDef("SchedModel"); - addWriteRes(WR, getProcModel(ModelDef).Index); + addWriteRes(WR, getProcModel(ModelDef)); } for (const Record *SWR : Records.getAllDerivedDefinitions("SchedWriteRes")) { const Record *ModelDef = SWR->getValueAsDef("SchedModel"); - addWriteRes(SWR, getProcModel(ModelDef).Index); + addWriteRes(SWR, getProcModel(ModelDef)); } for (const Record *RA : Records.getAllDerivedDefinitions("ReadAdvance")) { const Record *ModelDef = RA->getValueAsDef("SchedModel"); - addReadAdvance(RA, getProcModel(ModelDef).Index); + addReadAdvance(RA, getProcModel(ModelDef)); } for (const Record *SRA : Records.getAllDerivedDefinitions("SchedReadAdvance")) { if (SRA->getValueInit("SchedModel")->isComplete()) { const Record *ModelDef = SRA->getValueAsDef("SchedModel"); - addReadAdvance(SRA, getProcModel(ModelDef).Index); + addReadAdvance(SRA, getProcModel(ModelDef)); } } // Add ProcResGroups that are defined within this processor model, which may @@ -2005,10 +2005,10 @@ void CodeGenSchedModels::collectRWResources(unsigned RWIdx, bool IsRead, if (SchedRW.TheDef) { if (!IsRead && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) { for (unsigned Idx : ProcIndices) - addWriteRes(SchedRW.TheDef, Idx); + addWriteRes(SchedRW.TheDef, ProcModels[Idx]); } else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) { for (unsigned Idx : ProcIndices) - addReadAdvance(SchedRW.TheDef, Idx); + addReadAdvance(SchedRW.TheDef, ProcModels[Idx]); } } for (auto *Alias : SchedRW.Aliases) { @@ -2104,16 +2104,14 @@ void CodeGenSchedModels::addProcResource(const Record *ProcResKind, // Add resources for a SchedWrite to this processor if they don't exist. 
void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef, - unsigned PIdx) { - assert(PIdx && "don't add resources to an invalid Processor model"); - - ConstRecVec &WRDefs = ProcModels[PIdx].WriteResDefs; + CodeGenProcModel &PM) { + ConstRecVec &WRDefs = PM.WriteResDefs; if (is_contained(WRDefs, ProcWriteResDef)) return; WRDefs.push_back(ProcWriteResDef); if (ProcWriteResDef->isSubClassOf("WriteRes")) { - auto &WRMap = ProcModels[PIdx].WriteResMap; + auto &WRMap = PM.WriteResMap; const Record *WRDef = ProcWriteResDef->getValueAsDef("WriteType"); if (!WRMap.try_emplace(WRDef, ProcWriteResDef).second) PrintFatalError(ProcWriteResDef->getLoc(), @@ -2123,13 +2121,13 @@ void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef, // Visit ProcResourceKinds referenced by the newly discovered WriteRes. for (const Record *ProcResDef : ProcWriteResDef->getValueAsListOfDefs("ProcResources")) { - addProcResource(ProcResDef, ProcModels[PIdx], ProcWriteResDef->getLoc()); + addProcResource(ProcResDef, PM, ProcWriteResDef->getLoc()); } } // Add resources for a ReadAdvance to this processor if they don't exist. 
void CodeGenSchedModels::addReadAdvance(const Record *ProcReadAdvanceDef, - unsigned PIdx) { + CodeGenProcModel &PM) { for (const Record *ValidWrite : ProcReadAdvanceDef->getValueAsListOfDefs("ValidWrites")) if (getSchedRWIdx(ValidWrite, /*IsRead=*/false) == 0) @@ -2139,13 +2137,13 @@ void CodeGenSchedModels::addReadAdvance(const Record *ProcReadAdvanceDef, "any instruction (" + ValidWrite->getName() + ")"); - ConstRecVec &RADefs = ProcModels[PIdx].ReadAdvanceDefs; + ConstRecVec &RADefs = PM.ReadAdvanceDefs; if (is_contained(RADefs, ProcReadAdvanceDef)) return; RADefs.push_back(ProcReadAdvanceDef); if (ProcReadAdvanceDef->isSubClassOf("ReadAdvance")) { - auto &RAMap = ProcModels[PIdx].ReadAdvanceMap; + auto &RAMap = PM.ReadAdvanceMap; const Record *RADef = ProcReadAdvanceDef->getValueAsDef("ReadType"); if (!RAMap.try_emplace(RADef, ProcReadAdvanceDef).second) PrintFatalError(ProcReadAdvanceDef->getLoc(), diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h index 467b77e8acba31..5d5aa44d882e59 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.h +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h @@ -653,9 +653,9 @@ class CodeGenSchedModels { void addProcResource(const Record *ProcResourceKind, CodeGenProcModel &PM, ArrayRef Loc); - void addWriteRes(const Record *ProcWriteResDef, unsigned PIdx); + void addWriteRes(const Record *ProcWriteResDef, CodeGenProcModel &PM); - void addReadAdvance(const Record *ProcReadAdvanceDef, unsigned PIdx); + void addReadAdvance(const Record *ProcReadAdvanceDef, CodeGenProcModel &PM); }; } // namespace llvm From 1a8f49fdda5b14ccc894aacee653f19130df3a30 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Thu, 23 Jan 2025 09:18:12 -0800 Subject: [PATCH 172/208] [mlir][python][cmake] Allow skipping nanobind compile options changes. 
(#123997) Context: https://github.com/llvm/llvm-project/pull/107103#discussion_r1925834532 This code is brittle, especially when called from a superproject that adds the `nanobind-*` target in a different source directory: ```cmake get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) ``` The changes here do help with my downstream build, but I'm not sure if using the `MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES` option introduced in https://github.com/llvm/llvm-project/pull/117934 is the right fix given that the option is currently scoped directly to one location with a matching name: https://github.com/llvm/llvm-project/blob/7ad8a3da4771ce8abbd146611124104d42a4e63e/mlir/cmake/modules/MLIRDetectPythonEnv.cmake#L4-L5 Some other solutions to consider: 1. Search through an explicit list of target names using `if (TARGET)` 2. Iterate over _all_ targets in the project, not just the targets in the current directory, using code like https://stackoverflow.com/a/62311397 3. Iterate over targets in the directory known to MLIR (`llvm-project/mlir/python`) 4. Move this `target_compile_options` setup into `mlir_configure_python_dev_packages` (I started on this, but that runs into similar issues where the target is defined in a different directory) --- mlir/cmake/modules/AddMLIRPython.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 3f5f2a35f8fb2e..a23de004eb014f 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -671,8 +671,11 @@ function(add_mlir_python_extension libname extname) ${ARG_SOURCES} ) - if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) - # Avoids warnings from upstream nanobind. + if (NOT MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES + AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) + # Avoid some warnings from upstream nanobind. 
+ # If a superproject set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES, let + # the super project handle compile options as it wishes. set(nanobind_target "nanobind-static") if (NOT TARGET ${nanobind_target}) # Get correct nanobind target name: nanobind-static-ft or something else From 8c138bee6e5afc963e77644a1b92e4b228ca34ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 23 Jan 2025 09:24:06 -0800 Subject: [PATCH 173/208] [flang][cuda] Handle pointer allocation with source (#124070) --- flang/include/flang/Runtime/CUDA/pointer.h | 7 +++++++ .../Optimizer/Transforms/CUFOpConversion.cpp | 9 +++++---- flang/runtime/CUDA/pointer.cpp | 15 +++++++++++++++ flang/test/Fir/CUDA/cuda-allocate.fir | 18 ++++++++++++++++++ 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h index db5242696303f5..2197d85f4b93e5 100644 --- a/flang/include/flang/Runtime/CUDA/pointer.h +++ b/flang/include/flang/Runtime/CUDA/pointer.h @@ -21,6 +21,13 @@ int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1, bool hasStat = false, const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, int sourceLine = 0); +/// Perform allocation of the descriptor without synchronization. Assign data +/// from source. 
+int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, + const Descriptor &source, int64_t stream = -1, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 23248f6d12622a..b0d6b0f0993a61 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -189,11 +189,12 @@ struct CUFAllocateOpConversion mlir::func::FuncOp func; if (op.getSource()) { - if (isPointer) - TODO(loc, "pointer allocation with source"); func = - fir::runtime::getRuntimeFunc( - loc, builder); + isPointer + ? fir::runtime::getRuntimeFunc( + loc, builder) + : fir::runtime::getRuntimeFunc(loc, builder); } else { func = isPointer diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp index 0c5d3a5a6297d8..35f373b0a56c37 100644 --- a/flang/runtime/CUDA/pointer.cpp +++ b/flang/runtime/CUDA/pointer.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "flang/Runtime/CUDA/pointer.h" +#include "../assign-impl.h" #include "../stat.h" #include "../terminator.h" +#include "flang/Runtime/CUDA/memmove-function.h" #include "flang/Runtime/pointer.h" #include "cuda_runtime.h" @@ -33,6 +35,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat, return stat; } +int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, + const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + int stat{RTNAME(CUFPointerAllocate)( + pointer, stream, hasStat, errMsg, sourceFile, sourceLine)}; + if (stat == StatOk) { + Terminator terminator{sourceFile, sourceLine}; + Fortran::runtime::DoFromSourceAssign( + pointer, source, terminator, &MemmoveHostToDevice); + 
} + return stat; +} + RT_EXT_API_GROUP_END } // extern "C" diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 2ac9498d355414..804bb8636685d1 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -192,4 +192,22 @@ func.func @_QPp_alloc() { // CHECK-LABEL: func.func @_QPp_alloc() // CHECK: fir.call @_FortranACUFPointerAllocate +func.func @_QPpointer_source() { + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFpointer_sourceEa"} + %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFpointer_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> + %5 = cuf.alloc !fir.box>> {bindc_name = "a_d", data_attr = #cuf.cuda, uniq_name = "_QFpointer_sourceEa_d"} -> !fir.ref>>> + %7 = fir.declare %5 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFpointer_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> + %8 = fir.load %4 : !fir.ref>>> + %22 = cuf.allocate %7 : !fir.ref>>> source(%8 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QPpointer_source() +// CHECK: _FortranACUFPointerAllocateSource + } // end of module From caf0540b91b0fee31353dc7049ae836e0f814cff Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Thu, 23 Jan 2025 17:24:57 +0000 Subject: [PATCH 174/208] [LoopVectorizer] Add support for chaining partial reductions (#120272) Chaining partial reductions, where multiple partial reductions share an accumulator, allow for more values to be combined together as part of the reduction without discarding the semantics of the partial reduction itself. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 64 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 4 +- llvm/lib/Transforms/Vectorize/VPlan.h | 4 +- .../AArch64/partial-reduce-chained.ll | 1025 +++++++++++++++++ 4 files changed, 1072 insertions(+), 25 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7167e2179af535..dec7a87ba9c50b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8682,12 +8682,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, /// are valid so recipes can be formed later. void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { // Find all possible partial reductions. - SmallVector, 1> + SmallVector> PartialReductionChains; - for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) - if (std::optional> Pair = - getScaledReduction(Phi, RdxDesc, Range)) - PartialReductionChains.push_back(*Pair); + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { + if (auto SR = getScaledReduction(Phi, RdxDesc.getLoopExitInstr(), Range)) + PartialReductionChains.append(*SR); + } // A partial reduction is invalid if any of its extends are used by // something that isn't another partial reduction. This is because the @@ -8715,26 +8715,44 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { } } -std::optional> -VPRecipeBuilder::getScaledReduction(PHINode *PHI, - const RecurrenceDescriptor &Rdx, +std::optional>> +VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range) { + + if (!CM.TheLoop->contains(RdxExitInstr)) + return std::nullopt; + // TODO: Allow scaling reductions when predicating. 
The select at // the end of the loop chooses between the phi value and most recent // reduction result, both of which have different VFs to the active lane // mask when scaling. - if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent())) return std::nullopt; - auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + auto *Update = dyn_cast(RdxExitInstr); if (!Update) return std::nullopt; Value *Op = Update->getOperand(0); Value *PhiOp = Update->getOperand(1); - if (Op == PHI) { - Op = Update->getOperand(1); - PhiOp = Update->getOperand(0); + if (Op == PHI) + std::swap(Op, PhiOp); + + SmallVector> Chains; + + // Try and get a scaled reduction from the first non-phi operand. + // If one is found, we use the discovered reduction instruction in + // place of the accumulator for costing. + if (auto *OpInst = dyn_cast(Op)) { + if (auto SR0 = getScaledReduction(PHI, OpInst, Range)) { + Chains.append(*SR0); + PHI = SR0->rbegin()->first.Reduction; + + Op = Update->getOperand(0); + PhiOp = Update->getOperand(1); + if (Op == PHI) + std::swap(Op, PhiOp); + } } if (PhiOp != PHI) return std::nullopt; @@ -8757,7 +8775,7 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI, TTI::PartialReductionExtendKind OpBExtend = TargetTransformInfo::getPartialReductionExtendKind(ExtB); - PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp); unsigned TargetScaleFactor = PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( @@ -8772,9 +8790,9 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI, return Cost.isValid(); }, Range)) - return std::make_pair(Chain, TargetScaleFactor); + Chains.push_back(std::make_pair(Chain, TargetScaleFactor)); - return std::nullopt; + return Chains; } VPRecipeBase * @@ -8869,12 +8887,14 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Unexpected number of operands for 
partial reduction"); VPValue *BinOp = Operands[0]; - VPValue *Phi = Operands[1]; - if (isa(BinOp->getDefiningRecipe())) - std::swap(BinOp, Phi); - - return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, - Reduction); + VPValue *Accumulator = Operands[1]; + VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); + if (isa(BinOpRecipe) || + isa(BinOpRecipe)) + std::swap(BinOp, Accumulator); + + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, + Accumulator, Reduction); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 44745bfd46f891..9b1f40d0560bc2 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -142,8 +142,8 @@ class VPRecipeBuilder { /// Returns null if no scaled reduction was found, otherwise a pair with a /// struct containing reduction information and the scaling factor between the /// number of elements in the input and output. 
- std::optional> - getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + std::optional>> + getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range); public: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8d3a2eaee2eff0..42b35e8b57c07d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2461,7 +2461,9 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), Opcode(Opcode) { - assert(isa(getOperand(1)->getDefiningRecipe()) && + auto *AccumulatorRecipe = getOperand(1)->getDefiningRecipe(); + assert((isa(AccumulatorRecipe) || + isa(AccumulatorRecipe)) && "Unexpected operand order for partial reduction recipe"); } ~VPPartialReductionRecipe() override = default; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll new file mode 100644 index 00000000000000..bedf8b6b3a9b56 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -0,0 +1,1025 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON +; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE +; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Function Attrs: mustprogress 
nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16) +define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load 
<16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 
[[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE: middle.block: +; 
CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], 
i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = 
phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub = sub i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] 
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr 
inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; 
CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; 
CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add.2 = add i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label 
%for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] 
= sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 
[[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( 
[[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw 
i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, 
%entry ], [ %add, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: 
entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; 
CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 
[[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub.2 = sub i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; 
CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x 
i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: 
[[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP21]] = add [[TMP19]], [[TMP20]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP22]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr 
inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) +; 
CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add nsw i32 %sub, %mul.ac + %mul.bc = mul nsw i32 %b.ext, %c.ext + %sub.2 = add i32 %add, %mul.bc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw 
<16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; 
CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr 
inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP21]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = 
getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add nsw i32 %sub, %mul.ac + %mul.bc = mul nsw i32 %b.ext, %c.ext + %sub.2 = sub i32 %add, %mul.bc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +attributes #0 = { vscale_range(1,16) } + + +!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!1 = distinct !{!0} From a2453097e3b4010162efacb4e7edcb121da8607f Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 23 Jan 2025 09:30:23 -0800 Subject: [PATCH 175/208] [llvm-profdata] Add block percent to detailed summary (#105915) --- .../Linux/Inputs/instrprof-value-merge.c | 2 + compiler-rt/test/profile/Linux/binary-id.c | 10 +++ .../test/profile/Linux/profile-version.c | 4 ++ llvm/lib/IR/ProfileSummary.cpp | 11 +-- .../cs-sample-nested-profile.test | 68 +++++++++---------- .../test/tools/llvm-profdata/general.proftext | 22 +++--- .../tools/llvm-profdata/sample-summary.test | 34 +++++----- .../suppl-instr-with-sample.test | 32 ++++----- .../llvm-profdata/vtable-value-prof.test | 14 ++-- llvm/tools/llvm-profdata/llvm-profdata.cpp | 9 +-- 10 files changed, 110 insertions(+), 96 deletions(-) diff --git a/compiler-rt/test/profile/Linux/Inputs/instrprof-value-merge.c b/compiler-rt/test/profile/Linux/Inputs/instrprof-value-merge.c index 6f5b4790cca564..3efbcf26e81ff7 100644 --- a/compiler-rt/test/profile/Linux/Inputs/instrprof-value-merge.c +++ b/compiler-rt/test/profile/Linux/Inputs/instrprof-value-merge.c @@ -54,6 
+54,8 @@ int main(int argc, char *argv[]) { // CHECK-NEXT: Total functions: 3 // CHECK-NEXT: Maximum function count: 327 // CHECK-NEXT: Maximum internal block count: 297 +// CHECK-NEXT: Total number of blocks: 8 +// CHECK-NEXT: Total count: 805 // CHECK-NEXT: Statistics for indirect call sites profile: // CHECK-NEXT: Total number of sites: 3 // CHECK-NEXT: Total number of sites with values: 3 diff --git a/compiler-rt/test/profile/Linux/binary-id.c b/compiler-rt/test/profile/Linux/binary-id.c index 9bff9004e97a7e..93a6852e626eb1 100644 --- a/compiler-rt/test/profile/Linux/binary-id.c +++ b/compiler-rt/test/profile/Linux/binary-id.c @@ -62,6 +62,8 @@ int main() { // BINARY-ID-RAW-PROF-NEXT: Total functions: 3 // BINARY-ID-RAW-PROF-NEXT: Maximum function count: 1 // BINARY-ID-RAW-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-RAW-PROF-NEXT: Total number of blocks: 3 +// BINARY-ID-RAW-PROF-NEXT: Total count: 3 // BINARY-ID-RAW-PROF-NEXT: Binary IDs: // BINARY-ID-RAW-PROF-NEXT: {{[0-9a-f]+}} @@ -69,6 +71,8 @@ int main() { // BINARY-ID-MERGE-PROF-NEXT: Total functions: 3 // BINARY-ID-MERGE-PROF-NEXT: Maximum function count: 3 // BINARY-ID-MERGE-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-MERGE-PROF-NEXT: Total number of blocks: 3 +// BINARY-ID-MERGE-PROF-NEXT: Total count: 9 // BINARY-ID-MERGE-PROF-NEXT: Binary IDs: // BINARY-ID-MERGE-PROF-NEXT: {{[0-9a-f]+}} @@ -76,6 +80,8 @@ int main() { // BINARY-ID-INDEXED-PROF-NEXT: Total functions: 3 // BINARY-ID-INDEXED-PROF-NEXT: Maximum function count: 3 // BINARY-ID-INDEXED-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-INDEXED-PROF-NEXT: Total number of blocks: 3 +// BINARY-ID-INDEXED-PROF-NEXT: Total count: 9 // BINARY-ID-INDEXED-PROF-NEXT: Binary IDs: // BINARY-ID-INDEXED-PROF-NEXT: {{[0-9a-f]+}} @@ -83,6 +89,8 @@ int main() { // BINARY-ID-SHARE-RAW-PROF-NEXT: Total functions: 3 // BINARY-ID-SHARE-RAW-PROF-NEXT: Maximum function count: 1 // BINARY-ID-SHARE-RAW-PROF-NEXT: Maximum internal 
block count: 0 +// BINARY-ID-SHARE-RAW-PROF-NEXT: Total number of blocks: 3 +// BINARY-ID-SHARE-RAW-PROF-NEXT: Total count: 3 // BINARY-ID-SHARE-RAW-PROF-NEXT: Binary IDs: // BINARY-ID-SHARE-RAW-PROF-NEXT: {{[0-9a-f]+}} // BINARY-ID-SHARE-RAW-PROF-NEXT: {{[0-9a-f]+}} @@ -92,6 +100,8 @@ int main() { // BINARY-ID-SHARE-INDEXED-PROF-NEXT: Total functions: 3 // BINARY-ID-SHARE-INDEXED-PROF-NEXT: Maximum function count: 1 // BINARY-ID-SHARE-INDEXED-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-SHARE-INDEXED-PROF-NEXT: Total number of blocks: 3 +// BINARY-ID-SHARE-INDEXED-PROF-NEXT: Total count: 3 // BINARY-ID-SHARE-INDEXED-PROF-NEXT: Binary IDs: // BINARY-ID-SHARE-INDEXED-PROF-NEXT: {{[0-9a-f]+}} // BINARY-ID-SHARE-INDEXED-PROF-NEXT: {{[0-9a-f]+}} diff --git a/compiler-rt/test/profile/Linux/profile-version.c b/compiler-rt/test/profile/Linux/profile-version.c index bffc602387ad50..7e20d1bee6cc5e 100644 --- a/compiler-rt/test/profile/Linux/profile-version.c +++ b/compiler-rt/test/profile/Linux/profile-version.c @@ -25,10 +25,14 @@ int main() { // RAW-PROF-NEXT: Total functions: 3 // RAW-PROF-NEXT: Maximum function count: 1 // RAW-PROF-NEXT: Maximum internal block count: 0 +// RAW-PROF-NEXT: Total number of blocks: 3 +// RAW-PROF-NEXT: Total count: 3 // RAW-PROF-NEXT: Profile version: {{[0-9]+}} // INDEXED-PROF: Instrumentation level: Front-end // INDEXED-PROF-NEXT: Total functions: 3 // INDEXED-PROF-NEXT: Maximum function count: 3 // INDEXED-PROF-NEXT: Maximum internal block count: 0 +// INDEXED-PROF-NEXT: Total number of blocks: 3 +// INDEXED-PROF-NEXT: Total count: 9 // INDEXED-PROF-NEXT: Profile version: {{[0-9]+}} diff --git a/llvm/lib/IR/ProfileSummary.cpp b/llvm/lib/IR/ProfileSummary.cpp index acb4c52e8918fc..59f29e5982dfb0 100644 --- a/llvm/lib/IR/ProfileSummary.cpp +++ b/llvm/lib/IR/ProfileSummary.cpp @@ -251,7 +251,7 @@ ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) { void ProfileSummary::printSummary(raw_ostream &OS) const { OS << "Total 
functions: " << NumFunctions << "\n"; OS << "Maximum function count: " << MaxFunctionCount << "\n"; - OS << "Maximum block count: " << MaxCount << "\n"; + OS << "Maximum internal block count: " << MaxInternalCount << "\n"; OS << "Total number of blocks: " << NumCounts << "\n"; OS << "Total count: " << TotalCount << "\n"; } @@ -259,9 +259,10 @@ void ProfileSummary::printSummary(raw_ostream &OS) const { void ProfileSummary::printDetailedSummary(raw_ostream &OS) const { OS << "Detailed summary:\n"; for (const auto &Entry : DetailedSummary) { - OS << Entry.NumCounts << " blocks with count >= " << Entry.MinCount - << " account for " - << format("%0.6g", (float)Entry.Cutoff / Scale * 100) - << " percentage of the total counts.\n"; + OS << format("%lu blocks (%.2f%%) with count >= %lu account for %0.6g%% of " + "the total counts.\n", + Entry.NumCounts, + NumCounts ? (100.f * Entry.NumCounts / NumCounts) : 0, + Entry.MinCount, 100.f * Entry.Cutoff / Scale); } } diff --git a/llvm/test/tools/llvm-profdata/cs-sample-nested-profile.test b/llvm/test/tools/llvm-profdata/cs-sample-nested-profile.test index 7b01324219115c..d2b07cf05fd240 100644 --- a/llvm/test/tools/llvm-profdata/cs-sample-nested-profile.test +++ b/llvm/test/tools/llvm-profdata/cs-sample-nested-profile.test @@ -153,47 +153,47 @@ RUN: llvm-profdata show -sample -detailed-summary %t3.proftext | FileCheck %s -c ; SUMMARY: Total functions: 4 ; SUMMARY-NEXT: Maximum function count: 32 -; SUMMARY-NEXT: Maximum block count: 362830 +; SUMMARY-NEXT: Maximum internal block count: 0 ; SUMMARY-NEXT: Total number of blocks: 16 ; SUMMARY-NEXT: Total count: 772562 ; SUMMARY-NEXT: Detailed summary: -; SUMMARY-NEXT: 1 blocks with count >= 362830 account for 1 percentage of the total counts. -; SUMMARY-NEXT: 1 blocks with count >= 362830 account for 10 percentage of the total counts. -; SUMMARY-NEXT: 1 blocks with count >= 362830 account for 20 percentage of the total counts. 
-; SUMMARY-NEXT: 1 blocks with count >= 362830 account for 30 percentage of the total counts. -; SUMMARY-NEXT: 1 blocks with count >= 362830 account for 40 percentage of the total counts. -; SUMMARY-NEXT: 2 blocks with count >= 362805 account for 50 percentage of the total counts. -; SUMMARY-NEXT: 2 blocks with count >= 362805 account for 60 percentage of the total counts. -; SUMMARY-NEXT: 2 blocks with count >= 362805 account for 70 percentage of the total counts. -; SUMMARY-NEXT: 2 blocks with count >= 362805 account for 80 percentage of the total counts. -; SUMMARY-NEXT: 2 blocks with count >= 362805 account for 90 percentage of the total counts. -; SUMMARY-NEXT: 3 blocks with count >= 23327 account for 95 percentage of the total counts. -; SUMMARY-NEXT: 4 blocks with count >= 23324 account for 99 percentage of the total counts. -; SUMMARY-NEXT: 4 blocks with count >= 23324 account for 99.9 percentage of the total counts. -; SUMMARY-NEXT: 11 blocks with count >= 24 account for 99.99 percentage of the total counts. -; SUMMARY-NEXT: 16 blocks with count >= 10 account for 99.999 percentage of the total counts. -; SUMMARY-NEXT: 16 blocks with count >= 10 account for 99.9999 percentage of the total counts. +; SUMMARY-NEXT: 1 blocks (6.25%) with count >= 362830 account for 1% of the total counts. +; SUMMARY-NEXT: 1 blocks (6.25%) with count >= 362830 account for 10% of the total counts. +; SUMMARY-NEXT: 1 blocks (6.25%) with count >= 362830 account for 20% of the total counts. +; SUMMARY-NEXT: 1 blocks (6.25%) with count >= 362830 account for 30% of the total counts. +; SUMMARY-NEXT: 1 blocks (6.25%) with count >= 362830 account for 40% of the total counts. +; SUMMARY-NEXT: 2 blocks (12.50%) with count >= 362805 account for 50% of the total counts. +; SUMMARY-NEXT: 2 blocks (12.50%) with count >= 362805 account for 60% of the total counts. +; SUMMARY-NEXT: 2 blocks (12.50%) with count >= 362805 account for 70% of the total counts. 
+; SUMMARY-NEXT: 2 blocks (12.50%) with count >= 362805 account for 80% of the total counts. +; SUMMARY-NEXT: 2 blocks (12.50%) with count >= 362805 account for 90% of the total counts. +; SUMMARY-NEXT: 3 blocks (18.75%) with count >= 23327 account for 95% of the total counts. +; SUMMARY-NEXT: 4 blocks (25.00%) with count >= 23324 account for 99% of the total counts. +; SUMMARY-NEXT: 4 blocks (25.00%) with count >= 23324 account for 99.9% of the total counts. +; SUMMARY-NEXT: 11 blocks (68.75%) with count >= 24 account for 99.99% of the total counts. +; SUMMARY-NEXT: 16 blocks (100.00%) with count >= 10 account for 99.999% of the total counts. +; SUMMARY-NEXT: 16 blocks (100.00%) with count >= 10 account for 99.9999% of the total counts. ; SUMMARY-NEST: Total functions: 4 ; SUMMARY-NEST-NEXT: Maximum function count: 32 -; SUMMARY-NEST-NEXT: Maximum block count: 362830 +; SUMMARY-NEST-NEXT: Maximum internal block count: 0 ; SUMMARY-NEST-NEXT: Total number of blocks: 15 ; SUMMARY-NEST-NEXT: Total count: 772504 ; SUMMARY-NEST-NEXT: Detailed summary: -; SUMMARY-NEST-NEXT: 1 blocks with count >= 362830 account for 1 percentage of the total counts. -; SUMMARY-NEST-NEXT: 1 blocks with count >= 362830 account for 10 percentage of the total counts. -; SUMMARY-NEST-NEXT: 1 blocks with count >= 362830 account for 20 percentage of the total counts. -; SUMMARY-NEST-NEXT: 1 blocks with count >= 362830 account for 30 percentage of the total counts. -; SUMMARY-NEST-NEXT: 1 blocks with count >= 362830 account for 40 percentage of the total counts. -; SUMMARY-NEST-NEXT: 2 blocks with count >= 362805 account for 50 percentage of the total counts. -; SUMMARY-NEST-NEXT: 2 blocks with count >= 362805 account for 60 percentage of the total counts. -; SUMMARY-NEST-NEXT: 2 blocks with count >= 362805 account for 70 percentage of the total counts. -; SUMMARY-NEST-NEXT: 2 blocks with count >= 362805 account for 80 percentage of the total counts. 
-; SUMMARY-NEST-NEXT: 2 blocks with count >= 362805 account for 90 percentage of the total counts. -; SUMMARY-NEST-NEXT: 3 blocks with count >= 23327 account for 95 percentage of the total counts. -; SUMMARY-NEST-NEXT: 4 blocks with count >= 23324 account for 99 percentage of the total counts. -; SUMMARY-NEST-NEXT: 4 blocks with count >= 23324 account for 99.9 percentage of the total counts. -; SUMMARY-NEST-NEXT: 10 blocks with count >= 21 account for 99.99 percentage of the total counts. -; SUMMARY-NEST-NEXT: 15 blocks with count >= 10 account for 99.999 percentage of the total counts. -; SUMMARY-NEST-NEXT: 15 blocks with count >= 10 account for 99.9999 percentage of the total counts. +; SUMMARY-NEST-NEXT: 1 blocks (6.67%) with count >= 362830 account for 1% of the total counts. +; SUMMARY-NEST-NEXT: 1 blocks (6.67%) with count >= 362830 account for 10% of the total counts. +; SUMMARY-NEST-NEXT: 1 blocks (6.67%) with count >= 362830 account for 20% of the total counts. +; SUMMARY-NEST-NEXT: 1 blocks (6.67%) with count >= 362830 account for 30% of the total counts. +; SUMMARY-NEST-NEXT: 1 blocks (6.67%) with count >= 362830 account for 40% of the total counts. +; SUMMARY-NEST-NEXT: 2 blocks (13.33%) with count >= 362805 account for 50% of the total counts. +; SUMMARY-NEST-NEXT: 2 blocks (13.33%) with count >= 362805 account for 60% of the total counts. +; SUMMARY-NEST-NEXT: 2 blocks (13.33%) with count >= 362805 account for 70% of the total counts. +; SUMMARY-NEST-NEXT: 2 blocks (13.33%) with count >= 362805 account for 80% of the total counts. +; SUMMARY-NEST-NEXT: 2 blocks (13.33%) with count >= 362805 account for 90% of the total counts. +; SUMMARY-NEST-NEXT: 3 blocks (20.00%) with count >= 23327 account for 95% of the total counts. +; SUMMARY-NEST-NEXT: 4 blocks (26.67%) with count >= 23324 account for 99% of the total counts. +; SUMMARY-NEST-NEXT: 4 blocks (26.67%) with count >= 23324 account for 99.9% of the total counts. 
+; SUMMARY-NEST-NEXT: 10 blocks (66.67%) with count >= 21 account for 99.99% of the total counts. +; SUMMARY-NEST-NEXT: 15 blocks (100.00%) with count >= 10 account for 99.999% of the total counts. +; SUMMARY-NEST-NEXT: 15 blocks (100.00%) with count >= 10 account for 99.9999% of the total counts. diff --git a/llvm/test/tools/llvm-profdata/general.proftext b/llvm/test/tools/llvm-profdata/general.proftext index 2dfb8e8b34d57e..89762f2540f6a6 100644 --- a/llvm/test/tools/llvm-profdata/general.proftext +++ b/llvm/test/tools/llvm-profdata/general.proftext @@ -71,18 +71,18 @@ hex_hash # DETAILED-SUMMARY: Total number of blocks: 10 # DETAILED-SUMMARY: Total count: 4539628424389557499 # DETAILED-SUMMARY: Detailed summary: -# DETAILED-SUMMARY: 3 blocks with count >= 576460752303423488 account for 80 percentage of the total counts. -# DETAILED-SUMMARY: 4 blocks with count >= 288230376151711744 account for 90 percentage of the total counts. -# DETAILED-SUMMARY: 4 blocks with count >= 288230376151711744 account for 95 percentage of the total counts. -# DETAILED-SUMMARY: 6 blocks with count >= 72057594037927936 account for 99 percentage of the total counts. -# DETAILED-SUMMARY: 6 blocks with count >= 72057594037927936 account for 99.9 percentage of the total counts. -# DETAILED-SUMMARY: 6 blocks with count >= 72057594037927936 account for 99.99 percentage of the total counts. -# DETAILED-SUMMARY: 6 blocks with count >= 72057594037927936 account for 99.999 percentage of the total counts. +# DETAILED-SUMMARY: 3 blocks (30.00%) with count >= 576460752303423488 account for 80% of the total counts. +# DETAILED-SUMMARY: 4 blocks (40.00%) with count >= 288230376151711744 account for 90% of the total counts. +# DETAILED-SUMMARY: 4 blocks (40.00%) with count >= 288230376151711744 account for 95% of the total counts. +# DETAILED-SUMMARY: 6 blocks (60.00%) with count >= 72057594037927936 account for 99% of the total counts. 
+# DETAILED-SUMMARY: 6 blocks (60.00%) with count >= 72057594037927936 account for 99.9% of the total counts. +# DETAILED-SUMMARY: 6 blocks (60.00%) with count >= 72057594037927936 account for 99.99% of the total counts. +# DETAILED-SUMMARY: 6 blocks (60.00%) with count >= 72057594037927936 account for 99.999% of the total counts. # RUN: llvm-profdata show --detailed-summary --detailed-summary-cutoffs=600000 %t.profdata | FileCheck %s -check-prefix=DETAILED-SUMMARY-2 -# DETAILED-SUMMARY-2: 2 blocks with count >= 1152921504606846976 account for 60 percentage of the total counts. +# DETAILED-SUMMARY-2: 2 blocks (28.57%) with count >= 1152921504606846976 account for 60% of the total counts. # # RUN: llvm-profdata show --detailed-summary --detailed-summary-cutoffs=600000,900000,999999 %t.profdata | FileCheck %s -check-prefix=DETAILED-SUMMARY-3 -# DETAILED-SUMMARY-3: 2 blocks with count >= 1152921504606846976 account for 60 percentage of the total counts. -# DETAILED-SUMMARY-3: 4 blocks with count >= 288230376151711744 account for 90 percentage of the total counts. -# DETAILED-SUMMARY-3: 6 blocks with count >= 72057594037927936 account for 99.9999 percentage of the total counts. +# DETAILED-SUMMARY-3: 2 blocks (28.57%) with count >= 1152921504606846976 account for 60% of the total counts. +# DETAILED-SUMMARY-3: 4 blocks (57.14%) with count >= 288230376151711744 account for 90% of the total counts. +# DETAILED-SUMMARY-3: 6 blocks (85.71%) with count >= 72057594037927936 account for 99.9999% of the total counts. 
diff --git a/llvm/test/tools/llvm-profdata/sample-summary.test b/llvm/test/tools/llvm-profdata/sample-summary.test index 3326c9bb29806b..310cc8bfb721cd 100644 --- a/llvm/test/tools/llvm-profdata/sample-summary.test +++ b/llvm/test/tools/llvm-profdata/sample-summary.test @@ -2,23 +2,23 @@ ; CHECK: Total functions: 3 ; CHECK-NEXT: Maximum function count: 1437 -; CHECK-NEXT: Maximum block count: 2080 +; CHECK-NEXT: Maximum internal block count: 0 ; CHECK-NEXT: Total number of blocks: 11 ; CHECK-NEXT: Total count: 12943 ; CHECK-NEXT: Detailed summary: -; CHECK-NEXT: 1 blocks with count >= 2080 account for 1 percentage of the total counts. -; CHECK-NEXT: 1 blocks with count >= 2080 account for 10 percentage of the total counts. -; CHECK-NEXT: 2 blocks with count >= 2064 account for 20 percentage of the total counts. -; CHECK-NEXT: 2 blocks with count >= 2064 account for 30 percentage of the total counts. -; CHECK-NEXT: 3 blocks with count >= 2000 account for 40 percentage of the total counts. -; CHECK-NEXT: 4 blocks with count >= 1437 account for 50 percentage of the total counts. -; CHECK-NEXT: 6 blocks with count >= 1075 account for 60 percentage of the total counts. -; CHECK-NEXT: 6 blocks with count >= 1075 account for 70 percentage of the total counts. -; CHECK-NEXT: 7 blocks with count >= 1000 account for 80 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 90 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 95 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 99 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 99.9 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 99.99 percentage of the total counts. -; CHECK-NEXT: 11 blocks with count >= 534 account for 99.999 percentage of the total counts. 
-; CHECK-NEXT: 11 blocks with count >= 534 account for 99.9999 percentage of the total counts. +; CHECK-NEXT: 1 blocks (9.09%) with count >= 2080 account for 1% of the total counts. +; CHECK-NEXT: 1 blocks (9.09%) with count >= 2080 account for 10% of the total counts. +; CHECK-NEXT: 2 blocks (18.18%) with count >= 2064 account for 20% of the total counts. +; CHECK-NEXT: 2 blocks (18.18%) with count >= 2064 account for 30% of the total counts. +; CHECK-NEXT: 3 blocks (27.27%) with count >= 2000 account for 40% of the total counts. +; CHECK-NEXT: 4 blocks (36.36%) with count >= 1437 account for 50% of the total counts. +; CHECK-NEXT: 6 blocks (54.55%) with count >= 1075 account for 60% of the total counts. +; CHECK-NEXT: 6 blocks (54.55%) with count >= 1075 account for 70% of the total counts. +; CHECK-NEXT: 7 blocks (63.64%) with count >= 1000 account for 80% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 90% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 95% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 99% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 99.9% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 99.99% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 99.999% of the total counts. +; CHECK-NEXT: 11 blocks (100.00%) with count >= 534 account for 99.9999% of the total counts. 
diff --git a/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test index 20d4d2198ff449..c22646e45849ea 100644 --- a/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test +++ b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test @@ -98,19 +98,19 @@ MIX5-NEXT: Maximum internal block count: 2000 MIX5-NEXT: Total number of blocks: 9 MIX5-NEXT: Total count: 6525 MIX5-NEXT: Detailed summary: -MIX5-NEXT: 1 blocks with count >= 3000 account for 1 percentage of the total counts. -MIX5-NEXT: 1 blocks with count >= 3000 account for 10 percentage of the total counts. -MIX5-NEXT: 1 blocks with count >= 3000 account for 20 percentage of the total counts. -MIX5-NEXT: 1 blocks with count >= 3000 account for 30 percentage of the total counts. -MIX5-NEXT: 1 blocks with count >= 3000 account for 40 percentage of the total counts. -MIX5-NEXT: 2 blocks with count >= 2000 account for 50 percentage of the total counts. -MIX5-NEXT: 2 blocks with count >= 2000 account for 60 percentage of the total counts. -MIX5-NEXT: 2 blocks with count >= 2000 account for 70 percentage of the total counts. -MIX5-NEXT: 3 blocks with count >= 1000 account for 80 percentage of the total counts. -MIX5-NEXT: 3 blocks with count >= 1000 account for 90 percentage of the total counts. -MIX5-NEXT: 4 blocks with count >= 500 account for 95 percentage of the total counts. -MIX5-NEXT: 4 blocks with count >= 500 account for 99 percentage of the total counts. -MIX5-NEXT: 6 blocks with count >= 12 account for 99.9 percentage of the total counts. -MIX5-NEXT: 6 blocks with count >= 12 account for 99.99 percentage of the total counts. -MIX5-NEXT: 6 blocks with count >= 12 account for 99.999 percentage of the total counts. -MIX5-NEXT: 6 blocks with count >= 12 account for 99.9999 percentage of the total counts. +MIX5-NEXT: 1 blocks (11.11%) with count >= 3000 account for 1% of the total counts. 
+MIX5-NEXT: 1 blocks (11.11%) with count >= 3000 account for 10% of the total counts. +MIX5-NEXT: 1 blocks (11.11%) with count >= 3000 account for 20% of the total counts. +MIX5-NEXT: 1 blocks (11.11%) with count >= 3000 account for 30% of the total counts. +MIX5-NEXT: 1 blocks (11.11%) with count >= 3000 account for 40% of the total counts. +MIX5-NEXT: 2 blocks (22.22%) with count >= 2000 account for 50% of the total counts. +MIX5-NEXT: 2 blocks (22.22%) with count >= 2000 account for 60% of the total counts. +MIX5-NEXT: 2 blocks (22.22%) with count >= 2000 account for 70% of the total counts. +MIX5-NEXT: 3 blocks (33.33%) with count >= 1000 account for 80% of the total counts. +MIX5-NEXT: 3 blocks (33.33%) with count >= 1000 account for 90% of the total counts. +MIX5-NEXT: 4 blocks (44.44%) with count >= 500 account for 95% of the total counts. +MIX5-NEXT: 4 blocks (44.44%) with count >= 500 account for 99% of the total counts. +MIX5-NEXT: 6 blocks (66.67%) with count >= 12 account for 99.9% of the total counts. +MIX5-NEXT: 6 blocks (66.67%) with count >= 12 account for 99.99% of the total counts. +MIX5-NEXT: 6 blocks (66.67%) with count >= 12 account for 99.999% of the total counts. +MIX5-NEXT: 6 blocks (66.67%) with count >= 12 account for 99.9999% of the total counts. 
diff --git a/llvm/test/tools/llvm-profdata/vtable-value-prof.test b/llvm/test/tools/llvm-profdata/vtable-value-prof.test index 8dc8f6f0d480ee..5ed4a356e1041b 100644 --- a/llvm/test/tools/llvm-profdata/vtable-value-prof.test +++ b/llvm/test/tools/llvm-profdata/vtable-value-prof.test @@ -1,18 +1,18 @@ -; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: rm -rf %t && mkdir %t ; Generate indexed profiles from text profiles -RUN: llvm-profdata merge --keep-vtable-symbols %S/Inputs/vtable-value-prof.proftext -o indexed.profdata +RUN: llvm-profdata merge --keep-vtable-symbols %S/Inputs/vtable-value-prof.proftext -o %t/indexed.profdata ; Show indexed profiles -RUN: llvm-profdata show --function=main --ic-targets --show-vtables indexed.profdata | FileCheck %s --check-prefix=INDEXED +RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t/indexed.profdata | FileCheck %s --check-prefix=INDEXED ; Show text profiles RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %S/Inputs/vtable-value-prof.proftext | FileCheck %s --check-prefix=ICTEXT ; Convert indexed profiles to its textual output and show it. 
-RUN: llvm-profdata merge --keep-vtable-symbols --text -o text-from-indexed.proftext indexed.profdata -RUN: llvm-profdata show --function=main --ic-targets --show-vtables text-from-indexed.proftext | FileCheck %s --check-prefix=INDEXED -RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text text-from-indexed.proftext | FileCheck %s --check-prefix=ICTEXT +RUN: llvm-profdata merge --keep-vtable-symbols --text -o %t/text-from-indexed.proftext %t/indexed.profdata +RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t/text-from-indexed.proftext | FileCheck %s --check-prefix=INDEXED +RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %t/text-from-indexed.proftext | FileCheck %s --check-prefix=ICTEXT INDEXED: Counters: INDEXED-NEXT: main: @@ -35,6 +35,8 @@ INDEXED-NEXT: Functions shown: 1 INDEXED-NEXT: Total functions: 6 INDEXED-NEXT: Maximum function count: 1000 INDEXED-NEXT: Maximum internal block count: 250 +INDEXED-NEXT: Total number of blocks: 8 +INDEXED-NEXT: Total count: 4001 INDEXED-NEXT: Statistics for indirect call sites profile: INDEXED-NEXT: Total number of sites: 2 INDEXED-NEXT: Total number of sites with values: 2 diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index ffc481f071857a..560210e59eeba6 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -3012,15 +3012,13 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { OS << "\n"; if (ShowAllFunctions || !FuncNameFilter.empty()) OS << "Functions shown: " << ShownFunctions << "\n"; - OS << "Total functions: " << PS->getNumFunctions() << "\n"; + PS->printSummary(OS); if (ShowValueCutoff > 0) { OS << "Number of functions with maximum count (< " << ShowValueCutoff << "): " << BelowCutoffFunctions << "\n"; OS << "Number of functions with maximum count (>= " << ShowValueCutoff << "): " << PS->getNumFunctions() - 
BelowCutoffFunctions << "\n"; } - OS << "Maximum function count: " << PS->getMaxFunctionCount() << "\n"; - OS << "Maximum internal block count: " << PS->getMaxInternalCount() << "\n"; if (TopNFunctions) { std::vector> SortedHottestFuncs; @@ -3050,11 +3048,8 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { showValueSitesStats(OS, IPVK_MemOPSize, VPStats[IPVK_MemOPSize]); } - if (ShowDetailedSummary) { - OS << "Total number of blocks: " << PS->getNumCounts() << "\n"; - OS << "Total count: " << PS->getTotalCount() << "\n"; + if (ShowDetailedSummary) PS->printDetailedSummary(OS); - } if (ShowBinaryIds) if (Error E = Reader->printBinaryIds(OS)) From e00f1f843610416f18a2fe4779c19310e808a1a4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Jan 2025 12:32:54 -0500 Subject: [PATCH 176/208] [ELF] Error for executable .note.GNU-stack unless -z execstack or -r .note.GNU-stack with the SHF_EXECINSTR flag requires an executable stack. This is exceedingly rare. We report an error to force the user to explicitly request an executable stack. Close #121234 Pull Request: https://github.com/llvm/llvm-project/pull/124068 --- lld/ELF/InputFiles.cpp | 13 +++++++++++-- lld/test/ELF/gnustack.s | 35 ++++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index c44773d0b7dabe..c3c6812c262028 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1025,9 +1025,18 @@ InputSectionBase *ObjFile::createInputSection(uint32_t idx, // Therefore, we make LLD always add PT_GNU_STACK unless it is // explicitly told to do otherwise (by -z execstack). Because the stack // executable-ness is controlled solely by command line options, - // .note.GNU-stack sections are simply ignored. - if (name == ".note.GNU-stack") + // .note.GNU-stack sections are, with one exception, ignored. 
Report + // an error if we encounter an executable .note.GNU-stack to force the + // user to explicitly request an executable stack. + if (name == ".note.GNU-stack") { + if ((sec.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable && + ctx.arg.zGnustack != GnuStackKind::Exec) { + Err(ctx) << this + << ": requires an executable stack, but -z execstack is not " + "specified"; + } return &InputSection::discarded; + } // Object files that use processor features such as Intel Control-Flow // Enforcement (CET) or AArch64 Branch Target Identification BTI, use a diff --git a/lld/test/ELF/gnustack.s b/lld/test/ELF/gnustack.s index 828e09328c892c..29e81538b6cab8 100644 --- a/lld/test/ELF/gnustack.s +++ b/lld/test/ELF/gnustack.s @@ -1,17 +1,20 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1 +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 x.s -o x.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 nox.s -o nox.o -# RUN: ld.lld %t1 -z execstack -o %t -# RUN: llvm-readobj --program-headers -S %t | FileCheck --check-prefix=RWX %s +# RUN: ld.lld a.o -z execstack -o out +# RUN: llvm-readobj --program-headers -S out | FileCheck --check-prefix=RWX %s -# RUN: ld.lld %t1 -o %t -# RUN: llvm-readobj --program-headers -S %t | FileCheck --check-prefix=RW %s +# RUN: ld.lld a.o -o out +# RUN: llvm-readobj --program-headers -S out | FileCheck --check-prefix=RW %s -# RUN: ld.lld %t1 -o %t -z noexecstack -# RUN: llvm-readobj --program-headers -S %t | FileCheck --check-prefix=RW %s +# RUN: ld.lld a.o -o out -z noexecstack +# RUN: llvm-readobj --program-headers -S out | FileCheck --check-prefix=RW %s -# RUN: ld.lld %t1 -o %t -z nognustack -# RUN: llvm-readobj --program-headers -s %t | FileCheck --check-prefix=NOGNUSTACK %s +# RUN: ld.lld a.o -o out -z nognustack +# RUN: llvm-readobj --program-headers -s out | FileCheck --check-prefix=NOGNUSTACK %s # RW: Type: 
PT_GNU_STACK # RW-NEXT: Offset: 0x0 @@ -40,5 +43,19 @@ # NOGNUSTACK-NOT: Type: PT_GNU_STACK +# RUN: not ld.lld a.o x.o nox.o x.o 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: +# RUN: not ld.lld a.o x.o nox.o x.o -z nognustack 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: +# ERR-COUNT-2: error: x.o: requires an executable stack, but -z execstack is not specified + +# RUN: ld.lld a.o x.o nox.o x.o -z execstack --fatal-warnings +# RUN: ld.lld -r x.o --fatal-warnings + +#--- a.s .globl _start _start: + +#--- x.s +.section .note.GNU-stack,"x" + +#--- nox.s +.section .note.GNU-stack,"" From 6d4e72abb85a4b302204dee881894271a84dd322 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 22 Jan 2025 16:36:23 +0000 Subject: [PATCH 177/208] [GVN] Add extra vscale tests with different types. NFC --- llvm/test/Transforms/GVN/vscale.ll | 254 ++++++++++ llvm/test/Transforms/NewGVN/vscale.ll | 648 ++++++++++++++++++++++++++ 2 files changed, 902 insertions(+) create mode 100644 llvm/test/Transforms/NewGVN/vscale.ll diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll index 71adaed8e5722b..f6e0f8c1a64944 100644 --- a/llvm/test/Transforms/GVN/vscale.ll +++ b/llvm/test/Transforms/GVN/vscale.ll @@ -387,3 +387,257 @@ if.then: if.else: ret void } + +; Different sizes / types + +define @load_v16i8_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4f32_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define 
@load_v4f32_store_v16i8_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i32_store_v4f32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i32_store_v4i64_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i64_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 32 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v2i32_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v2i32_store_v4i32_forward_load_offsets(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr , ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[Q]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %q = getelementptr , ptr %p, i64 1 + %load = load , ptr %q + ret %load 
+} + +define @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[Q]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %q = getelementptr <2 x i32>, ptr %p, i64 1 + %load = load , ptr %q + ret %load +} + +define @load_v2p0_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v2i64_store_v2p0_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x) { +; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store <4 x i32> %x, ptr %p + %load = load , ptr %p + ret %load +} + +define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <16 x i8> [[LOAD]] +; + store %x, ptr %p + %load = load <16 x i8>, ptr %p + ret <16 x i8> %load +} + +define @load_v16i8_store_v4i32_forward_constant(ptr %p) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( +; CHECK-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 +; CHECK-NEXT: 
[[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store splat (i32 4), ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v16i8_struct_store_v4i32_forward_load(ptr %p, { } %x) { +; CHECK-LABEL: @load_v16i8_struct_store_v4i32_forward_load( +; CHECK-NEXT: store { } [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store { } %x, ptr %p + %load = load , ptr %p + ret %load +} + +define {} @load_v16i8_store_v4i32_struct_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_struct_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load { }, ptr [[P]], align 16 +; CHECK-NEXT: ret { } [[LOAD]] +; + store %x, ptr %p + %load = load { }, ptr %p + ret { } %load +} + +define { , , , } @bigexample({ , , , } %a) vscale_range(1,16) { +; CHECK-LABEL: @bigexample( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { , , , }, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { , , , } [[A:%.*]], 0 +; CHECK-NEXT: store [[A_ELT]], ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { , , , } [[A]], 1 +; CHECK-NEXT: store [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 5 +; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { , , , } [[A]], 2 +; CHECK-NEXT: store [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48 +; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]] +; CHECK-NEXT: 
[[A_ELT6:%.*]] = extractvalue { , , , } [[A]], 3 +; CHECK-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load , ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[DOTUNPACK]], 0 +; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load , ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[DOTUNPACK8]], 1 +; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load , ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[DOTUNPACK10]], 2 +; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load , ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[DOTUNPACK12]], 3 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: ret { , , , } [[TMP15]] +; +entry: + %ref.tmp = alloca { , , , }, align 16 + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp) + %a.elt = extractvalue { , , , } %a, 0 + store %a.elt, ptr %ref.tmp, align 16 + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 4 + %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1 + %a.elt2 = extractvalue { , , , } %a, 1 + store %a.elt2, ptr %ref.tmp.repack1, align 16 + %2 = call i64 @llvm.vscale.i64() + %3 = shl i64 %2, 5 + %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3 + %a.elt4 = extractvalue { , , , } %a, 2 + store %a.elt4, ptr %ref.tmp.repack3, align 16 + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 48 + %ref.tmp.repack5 = getelementptr inbounds i8, ptr %ref.tmp, i64 %5 + %a.elt6 = extractvalue { , , , } %a, 3 + store %a.elt6, ptr %ref.tmp.repack5, align 16 + %.unpack = load , ptr %ref.tmp, align 16 + %6 = insertvalue { , , , } poison, %.unpack, 0 + %7 = call i64 @llvm.vscale.i64() + %8 = shl i64 %7, 4 + %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8 + %.unpack8 = load , ptr %.elt7, align 16 + %9 = insertvalue { , , , } %6, %.unpack8, 1 + %10 = 
call i64 @llvm.vscale.i64() + %11 = shl i64 %10, 5 + %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11 + %.unpack10 = load , ptr %.elt9, align 16 + %12 = insertvalue { , , , } %9, %.unpack10, 2 + %13 = call i64 @llvm.vscale.i64() + %14 = mul i64 %13, 48 + %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14 + %.unpack12 = load , ptr %.elt11, align 16 + %15 = insertvalue { , , , } %12, %.unpack12, 3 + call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp) + ret { , , , } %15 +} diff --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll new file mode 100644 index 00000000000000..500d58baed1a2c --- /dev/null +++ b/llvm/test/Transforms/NewGVN/vscale.ll @@ -0,0 +1,648 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -passes=newgvn,dce | FileCheck %s + +; Analyze Load from clobbering Load. + +define @load_store_clobber_load(ptr %p) { +; CHECK-LABEL: @load_store_clobber_load( +; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: store zeroinitializer, ptr undef, align 16 +; CHECK-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] +; CHECK-NEXT: ret [[ADD]] +; + %load1 = load , ptr %p + store zeroinitializer, ptr undef + %load2 = load , ptr %p ; <- load to be eliminated + %add = add %load1, %load2 + ret %add +} + +define @load_store_clobber_load_mayalias(ptr %p, ptr %p2) { +; CHECK-LABEL: @load_store_clobber_load_mayalias( +; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: store zeroinitializer, ptr [[P2:%.*]], align 16 +; CHECK-NEXT: [[LOAD2:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[SUB:%.*]] = sub [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret [[SUB]] +; + %load1 = load , ptr %p + store zeroinitializer, ptr %p2 + %load2 = load , ptr %p + %sub = sub %load1, %load2 + ret %sub +} + +define @load_store_clobber_load_noalias(ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: @load_store_clobber_load_noalias( +; CHECK-NEXT: 
[[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: store zeroinitializer, ptr [[P2:%.*]], align 16 +; CHECK-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] +; CHECK-NEXT: ret [[ADD]] +; + %load1 = load , ptr %p + store zeroinitializer, ptr %p2 + %load2 = load , ptr %p ; <- load to be eliminated + %add = add %load1, %load2 + ret %add +} + +; BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias. +define i32 @load_clobber_load_gep1(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep1( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr , ptr %p, i64 0, i64 1 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr i32, ptr %p, i64 1 + %load2 = load i32, ptr %gep2 ; <- load could be eliminated + %add = add i32 %load1, %load2 + ret i32 %add +} + +define i32 @load_clobber_load_gep2(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep2( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 4 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr , ptr %p, i64 1, i64 0 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr i32, ptr %p, i64 4 + %load2 = load i32, ptr %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr + %add = add i32 %load1, %load2 + ret i32 %add +} + +; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias. 
+define i32 @load_clobber_load_gep3(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep3( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[CAST:%.*]] = bitcast float [[LOAD2]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[CAST]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr , ptr %p, i64 1, i64 0 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr , ptr %p, i64 1, i64 0 + %load2 = load float, ptr %gep2 ; <- load could be eliminated + %cast = bitcast float %load2 to i32 + %add = add i32 %load1, %cast + ret i32 %add +} + +define @load_clobber_load_fence(ptr %p) { +; CHECK-LABEL: @load_clobber_load_fence( +; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm "", "~{memory}"() +; CHECK-NEXT: [[LOAD2:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[SUB:%.*]] = sub [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret [[SUB]] +; + %load1 = load , ptr %p + call void asm "", "~{memory}"() + %load2 = load , ptr %p + %sub = sub %load1, %load2 + ret %sub +} + +define @load_clobber_load_sideeffect(ptr %p) { +; CHECK-LABEL: @load_clobber_load_sideeffect( +; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm sideeffect "", ""() +; CHECK-NEXT: [[LOAD2:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret [[ADD]] +; + %load1 = load , ptr %p + call void asm sideeffect "", ""() + %load2 = load , ptr %p + %add = add %load1, %load2 + ret %add +} + +; Analyze Load from clobbering Store. 
+ +define @store_forward_to_load(ptr %p) { +; CHECK-LABEL: @store_forward_to_load( +; CHECK-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: ret zeroinitializer +; + store zeroinitializer, ptr %p + %load = load , ptr %p + ret %load +} + +define @store_forward_to_load_sideeffect(ptr %p) { +; CHECK-LABEL: @store_forward_to_load_sideeffect( +; CHECK-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm sideeffect "", ""() +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store zeroinitializer, ptr %p + call void asm sideeffect "", ""() + %load = load , ptr %p + ret %load +} + +define i32 @store_clobber_load() { +; CHECK-LABEL: @store_clobber_load( +; CHECK-NEXT: [[ALLOC:%.*]] = alloca , align 16 +; CHECK-NEXT: store undef, ptr [[ALLOC]], align 16 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr , ptr [[ALLOC]], i32 0, i32 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + %alloc = alloca + store undef, ptr %alloc + %ptr = getelementptr , ptr %alloc, i32 0, i32 1 + %load = load i32, ptr %ptr + ret i32 %load +} + +; Analyze Load from clobbering MemInst. 
+ +declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) + +define i32 @memset_clobber_load(ptr %p) { +; CHECK-LABEL: @memset_clobber_load( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: ret i32 16843009 +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr , ptr %p, i64 0, i64 5 + %load = load i32, ptr %gep + ret i32 %load +} + +define i32 @memset_clobber_load_vscaled_base(ptr %p) { +; CHECK-LABEL: @memset_clobber_load_vscaled_base( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P]], i64 1, i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr , ptr %p, i64 1, i64 1 + %load = load i32, ptr %gep + ret i32 %load +} + +define i32 @memset_clobber_load_nonconst_index(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @memset_clobber_load_nonconst_index( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P]], i64 [[IDX1:%.*]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr , ptr %p, i64 %idx1, i64 %idx2 + %load = load i32, ptr %gep + ret i32 %load +} + + +; Load elimination across BBs + +define ptr @load_from_alloc_replaced_with_undef() { +; CHECK-LABEL: @load_from_alloc_replaced_with_undef( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca , align 16 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[A]], i64 0, i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[LOAD]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], 
label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store zeroinitializer, ptr [[A]], align 16 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret ptr [[A]] +; +entry: + %a = alloca + %gep = getelementptr , ptr %a, i64 0, i64 1 + %load = load i32, ptr %gep ; <- load to be eliminated + %tobool = icmp eq i32 %load, 0 ; <- icmp to be eliminated + br i1 %tobool, label %if.end, label %if.then + +if.then: + store zeroinitializer, ptr %a + br label %if.end + +if.end: + ret ptr %a +} + +define i32 @redundant_load_elimination_1(ptr %p) { +; CHECK-LABEL: @redundant_load_elimination_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret i32 [[LOAD1]] +; +entry: + %gep = getelementptr , ptr %p, i64 1, i64 1 + %load1 = load i32, ptr %gep + %cmp = icmp eq i32 %load1, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %load2 = load i32, ptr %gep ; <- load to be eliminated + %add = add i32 %load1, %load2 + br label %if.end + +if.end: + %result = phi i32 [ %add, %if.then ], [ %load1, %entry ] + ret i32 %result +} + +; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as NoAlias. 
+define void @redundant_load_elimination_2(i1 %c, ptr %p, ptr %q) { +; CHECK-LABEL: @redundant_load_elimination_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 1 +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 1, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP2]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[T:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: store i32 [[T]], ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %gep1 = getelementptr , ptr %p, i64 1, i64 1 + store i32 0, ptr %gep1 + %gep2 = getelementptr , ptr %p, i64 1, i64 0 + store i32 1, ptr %gep2 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} + +define void @redundant_load_elimination_zero_index(i1 %c, ptr %p, ptr %q) { +; CHECK-LABEL: @redundant_load_elimination_zero_index( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: store i32 1, ptr [[P]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %gep1 = getelementptr , ptr %p, i64 0, i64 1 + store i32 0, ptr %gep1 + store i32 1, ptr %p + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} + +define void @redundant_load_elimination_zero_index_1(i1 %c, ptr %p, ptr %q, i64 %i) { +; CHECK-LABEL: @redundant_load_elimination_zero_index_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[J:%.*]] = 
add i64 [[I:%.*]], 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 [[J]] +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 0, i64 [[I]] +; CHECK-NEXT: store i32 1, ptr [[GEP2]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %j = add i64 %i, 1 + %gep1 = getelementptr , ptr %p, i64 0, i64 %j + store i32 0, ptr %gep1 + %gep2 = getelementptr , ptr %p, i64 0, i64 %i + store i32 1, ptr %gep2 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} +; TODO: load in if.then could have been eliminated +define void @missing_load_elimination(i1 %c, ptr %p, ptr %q, %v) { +; CHECK-LABEL: @missing_load_elimination( +; CHECK-NEXT: entry: +; CHECK-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[P1:%.*]] = getelementptr , ptr [[P]], i64 1 +; CHECK-NEXT: store [[V:%.*]], ptr [[P1]], align 16 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[T:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: store [[T]], ptr [[Q:%.*]], align 16 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + store zeroinitializer, ptr %p + %p1 = getelementptr , ptr %p, i64 1 + store %v, ptr %p1 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load , ptr %p ; load could be eliminated + store %t, ptr %q + ret void + +if.else: + ret void +} + +; Different sizes / types + +define @load_v16i8_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: 
ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4f32_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4f32_store_v16i8_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i32_store_v4f32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i32_store_v4i64_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v4i64_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 32 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v2i32_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define 
@load_v2i32_store_v4i32_forward_load_offsets(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr , ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[Q]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %q = getelementptr , ptr %p, i64 1 + %load = load , ptr %q + ret %load +} + +define @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[Q]], align 8 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %q = getelementptr <2 x i32>, ptr %p, i64 1 + %load = load , ptr %q + ret %load +} + +define @load_v2p0_store_v4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v2i64_store_v2p0_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store %x, ptr %p + %load = load , ptr %p + ret %load +} + +define @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x) { +; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store <4 x i32> %x, ptr %p + %load = load , ptr %p + ret %load +} + +define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load( +; CHECK-NEXT: store 
[[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <16 x i8> [[LOAD]] +; + store %x, ptr %p + %load = load <16 x i8>, ptr %p + ret <16 x i8> %load +} + +define @load_v16i8_store_v4i32_forward_constant(ptr %p) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( +; CHECK-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store splat (i32 4), ptr %p + %load = load , ptr %p + ret %load +} + +define @load_v16i8_struct_store_v4i32_forward_load(ptr %p, { } %x) { +; CHECK-LABEL: @load_v16i8_struct_store_v4i32_forward_load( +; CHECK-NEXT: store { } [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret [[LOAD]] +; + store { } %x, ptr %p + %load = load , ptr %p + ret %load +} + +define {} @load_v16i8_store_v4i32_struct_forward_load(ptr %p, %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_struct_forward_load( +; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load { }, ptr [[P]], align 16 +; CHECK-NEXT: ret { } [[LOAD]] +; + store %x, ptr %p + %load = load { }, ptr %p + ret { } %load +} + +define { , , , } @bigexample({ , , , } %a) vscale_range(1,16) { +; CHECK-LABEL: @bigexample( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { , , , }, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { , , , } [[A:%.*]], 0 +; CHECK-NEXT: store [[A_ELT]], ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { , , , } [[A]], 1 +; CHECK-NEXT: store [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = 
shl i64 [[TMP0]], 5 +; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { , , , } [[A]], 2 +; CHECK-NEXT: store [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48 +; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]] +; CHECK-NEXT: [[A_ELT6:%.*]] = extractvalue { , , , } [[A]], 3 +; CHECK-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load , ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[DOTUNPACK]], 0 +; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load , ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[DOTUNPACK8]], 1 +; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load , ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[DOTUNPACK10]], 2 +; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load , ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[DOTUNPACK12]], 3 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: ret { , , , } [[TMP15]] +; +entry: + %ref.tmp = alloca { , , , }, align 16 + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp) + %a.elt = extractvalue { , , , } %a, 0 + store %a.elt, ptr %ref.tmp, align 16 + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 4 + %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1 + %a.elt2 = extractvalue { , , , } %a, 1 + store %a.elt2, ptr %ref.tmp.repack1, align 16 + %2 = call i64 @llvm.vscale.i64() + %3 = shl i64 %2, 5 + %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3 + %a.elt4 = extractvalue { , , , } %a, 2 + store %a.elt4, ptr %ref.tmp.repack3, align 16 + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 48 + %ref.tmp.repack5 = getelementptr inbounds i8, 
ptr %ref.tmp, i64 %5 + %a.elt6 = extractvalue { , , , } %a, 3 + store %a.elt6, ptr %ref.tmp.repack5, align 16 + %.unpack = load , ptr %ref.tmp, align 16 + %6 = insertvalue { , , , } poison, %.unpack, 0 + %7 = call i64 @llvm.vscale.i64() + %8 = shl i64 %7, 4 + %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8 + %.unpack8 = load , ptr %.elt7, align 16 + %9 = insertvalue { , , , } %6, %.unpack8, 1 + %10 = call i64 @llvm.vscale.i64() + %11 = shl i64 %10, 5 + %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11 + %.unpack10 = load , ptr %.elt9, align 16 + %12 = insertvalue { , , , } %9, %.unpack10, 2 + %13 = call i64 @llvm.vscale.i64() + %14 = mul i64 %13, 48 + %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14 + %.unpack12 = load , ptr %.elt11, align 16 + %15 = insertvalue { , , , } %12, %.unpack12, 3 + call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp) + ret { , , , } %15 +} From 3d72619d751994f3b2b13c1fbb38f5f2541ea0ae Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 23 Jan 2025 17:36:06 +0000 Subject: [PATCH 178/208] [InstCombine] Add a test for splitting scalable structs. 
NFC --- .../InstCombine/scalable-vector-struct.ll | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll index c9966be72fb518..e7d4a3b7d20444 100644 --- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll +++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll @@ -28,3 +28,122 @@ define void @store(ptr %x, %y, %z) { store %struct.test %b, ptr %x ret void } + +define {, } @split_load(ptr %p) nounwind { +; CHECK-LABEL: define { , } @split_load +; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 +; CHECK-NEXT: ret { , } [[R]] +; +entry: + %r = load {, }, ptr %p + ret {, } %r +} + +define {} @split_load_one(ptr %p) nounwind { +; CHECK-LABEL: define { } @split_load_one +; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R_UNPACK:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[R1:%.*]] = insertvalue { } poison, [[R_UNPACK]], 0 +; CHECK-NEXT: ret { } [[R1]] +; +entry: + %r = load {}, ptr %p + ret {} %r +} + +define void @split_store({, } %x, ptr %p) nounwind { +; CHECK-LABEL: define void @split_store +; CHECK-SAME: ({ , } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 +; CHECK-NEXT: ret void +; +entry: + store {, } %x, ptr %p + ret void +} + +define void @split_store_one({} %x, ptr %p) nounwind { +; CHECK-LABEL: define void @split_store_one +; CHECK-SAME: ({ } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { } [[X]], 0 +; CHECK-NEXT: store [[TMP0]], ptr [[P]], align 16 +; CHECK-NEXT: ret void +; +entry: + store {} %x, ptr %p + ret void +} + +define {<16 x i8>, <16 x i8>} @check_v16i8_v4i32({<4 x i32>, <4 x i32>} %x, ptr %p) nounwind { +; CHECK-LABEL: define { <16 x i8>, <16 x i8> } 
@check_v16i8_v4i32 +; CHECK-SAME: ({ <4 x i32>, <4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 0 +; CHECK-NEXT: store <4 x i32> [[X_ELT]], ptr [[P]], align 16 +; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 1 +; CHECK-NEXT: store <4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16 +; CHECK-NEXT: [[R_UNPACK_CAST:%.*]] = bitcast <4 x i32> [[X_ELT]] to <16 x i8> +; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <16 x i8>, <16 x i8> } poison, <16 x i8> [[R_UNPACK_CAST]], 0 +; CHECK-NEXT: [[R_UNPACK4_CAST:%.*]] = bitcast <4 x i32> [[X_ELT2]] to <16 x i8> +; CHECK-NEXT: [[R5:%.*]] = insertvalue { <16 x i8>, <16 x i8> } [[TMP0]], <16 x i8> [[R_UNPACK4_CAST]], 1 +; CHECK-NEXT: ret { <16 x i8>, <16 x i8> } [[R5]] +; +entry: + store {<4 x i32>, <4 x i32>} %x, ptr %p + %r = load {<16 x i8>, <16 x i8>}, ptr %p + ret {<16 x i8>, <16 x i8>} %r +} + +define {, } @check_nxv16i8_nxv4i32({, } %x, ptr %p) nounwind { +; CHECK-LABEL: define { , } @check_nxv16i8_nxv4i32 +; CHECK-SAME: ({ , } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 +; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 +; CHECK-NEXT: ret { , } [[R]] +; +entry: + store {, } %x, ptr %p + %r = load {, }, ptr %p + ret {, } %r +} + +define {, } @alloca_nxv16i8_nxv4i32({, } %x) nounwind { +; CHECK-LABEL: define { , } @alloca_nxv16i8_nxv4i32 +; CHECK-SAME: ({ , } [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = alloca { , }, align 16 +; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 +; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 +; CHECK-NEXT: ret { , } [[R]] +; +entry: + %p = alloca {, } + store {, } %x, ptr %p + %r = load {, }, ptr %p + ret {, } %r +} + +define { <16 x i8>, <32 x i8> } @differenttypes({ <4 
x i32>, <8 x i32> } %a, ptr %p) { +; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes +; CHECK-SAME: ({ <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]]) +; CHECK-NEXT: store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]]) +; CHECK-NEXT: ret { <16 x i8>, <32 x i8> } [[TMP0]] +; +entry: + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5 + store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16 + %2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16 + call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5 + ret { <16 x i8>, <32 x i8> } %2 +} From e0622245967514c27b538cc10e04184323c5f96e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Jan 2025 09:45:51 -0800 Subject: [PATCH 179/208] [test] Remove misleading '' --- mlir/test/Transforms/inlining-recursive-self.mlir | 4 ++-- mlir/test/Transforms/inlining-recursive.mlir | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/test/Transforms/inlining-recursive-self.mlir b/mlir/test/Transforms/inlining-recursive-self.mlir index 5cc922db8e9786..01d5b8bd2a76c7 100644 --- a/mlir/test/Transforms/inlining-recursive-self.mlir +++ b/mlir/test/Transforms/inlining-recursive-self.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -inline='default-pipeline=''' | FileCheck %s -// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=''' | FileCheck %s +// RUN: mlir-opt %s -inline='default-pipeline=' | FileCheck %s +// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=' | FileCheck %s // CHECK-LABEL: func.func @b0 func.func @b0() { diff --git a/mlir/test/Transforms/inlining-recursive.mlir b/mlir/test/Transforms/inlining-recursive.mlir index a02fe69133ad87..403accd8b7ee8c 100644 --- 
a/mlir/test/Transforms/inlining-recursive.mlir +++ b/mlir/test/Transforms/inlining-recursive.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -inline='default-pipeline=''' | FileCheck %s -// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=''' | FileCheck %s +// RUN: mlir-opt %s -inline='default-pipeline=' | FileCheck %s +// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=' | FileCheck %s // CHECK-LABEL: func.func @foo0 func.func @foo0(%arg0 : i32) -> i32 { From ed512710a5e855a029a05f399335e03db0e704bd Mon Sep 17 00:00:00 2001 From: hidekisaito Date: Thu, 23 Jan 2025 09:46:56 -0800 Subject: [PATCH 180/208] [Offload] Make MemoryManager threshold ENV var size_t type. (#124063) --- offload/plugins-nextgen/common/include/MemoryManager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h index fe1989930b76ef..a4f6e628c403ab 100644 --- a/offload/plugins-nextgen/common/include/MemoryManager.h +++ b/offload/plugins-nextgen/common/include/MemoryManager.h @@ -324,7 +324,7 @@ class MemoryManagerTy { /// manager explicitly by setting the var to 0. If user doesn't specify /// anything, returns <0, true>. static std::pair getSizeThresholdFromEnv() { - static UInt32Envar MemoryManagerThreshold( + static UInt64Envar MemoryManagerThreshold( "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD", 0); size_t Threshold = MemoryManagerThreshold.get(); From 6a97897d5c159a52975bac19ac22c7913672c549 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 23 Jan 2025 09:47:10 -0800 Subject: [PATCH 181/208] [NFC][modules] Create objects on the stack (#124034) `ClangTool` change to fix memory leak. `FixedCompilationDatabase` changed just to makes it simpler. 
--- clang-tools-extra/modularize/CoverageChecker.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index b536ee00497c03..fe6711398ab7dc 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -278,15 +278,15 @@ CoverageChecker::collectUmbrellaHeaderHeaders(StringRef UmbrellaHeaderName) { sys::fs::current_path(PathBuf); // Create the compilation database. - std::unique_ptr Compilations; - Compilations.reset(new FixedCompilationDatabase(Twine(PathBuf), CommandLine)); + FixedCompilationDatabase Compilations(Twine(PathBuf), CommandLine); std::vector HeaderPath; HeaderPath.push_back(std::string(UmbrellaHeaderName)); // Create the tool and run the compilation. - ClangTool Tool(*Compilations, HeaderPath); - int HadErrors = Tool.run(new CoverageCheckerFrontendActionFactory(*this)); + ClangTool Tool(Compilations, HeaderPath); + CoverageCheckerFrontendActionFactory ActionFactory(*this); + int HadErrors = Tool.run(&ActionFactory); // If we had errors, exit early. return !HadErrors; From c7e6ca76cb4be7b1707cb583cf4aa4d458b312aa Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 23 Jan 2025 09:47:41 -0800 Subject: [PATCH 182/208] [SLP][NFC]Add dump() method for ScheduleData struct type for better debugging --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4de632d4ef149d..c98d872fb6467f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3986,6 +3986,8 @@ class BoUpSLP { } } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } + Instruction *Inst = nullptr; /// The TreeEntry that this instruction corresponds to. 
From 66e49e38aeed92c48ba175f31e12b07a8c526d11 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 23 Jan 2025 09:52:34 -0800 Subject: [PATCH 183/208] [YAML] Don't validate `Fill::Size` after error (#123280) Size is required, so we don't know if it's in uninitialized state after the previous error. Triggers msan on llvm/test/tools/yaml2obj/ELF/custom-fill.yaml NOSIZE test. We have `Fill` Section with Pattern, but no size. Before the fix it produced error: ``` YAML:169:5: error: missing required key 'Size' - Type: Fill ^ YAML:169:5: error: "Size" can't be 0 when "Pattern" is not empty - Type: Fill ``` The same applies to `MachOYAML::Section` fields `content` and `size`. However `MachOYAML::Section` matches size first, so on error, content is not set anyway. Added error checking just in case. --- llvm/lib/ObjectYAML/ELFYAML.cpp | 4 +- llvm/lib/ObjectYAML/MachOYAML.cpp | 5 ++- llvm/test/ObjectYAML/MachO/section_data.yaml | 41 +++++++++++++++++++ llvm/test/tools/yaml2obj/ELF/custom-fill.yaml | 5 ++- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 7e94d01a971534..24f426a9aa1f7c 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1747,7 +1747,9 @@ void MappingTraits>::mapping( std::string MappingTraits>::validate( IO &io, std::unique_ptr &C) { if (const auto *F = dyn_cast(C.get())) { - if (F->Pattern && F->Pattern->binary_size() != 0 && !F->Size) + // Can't check the `Size`, as it's required and may be left uninitialized by + // previous error. 
+ if (!io.error() && F->Pattern && F->Pattern->binary_size() != 0 && !F->Size) return "\"Size\" can't be 0 when \"Pattern\" is not empty"; return ""; } diff --git a/llvm/lib/ObjectYAML/MachOYAML.cpp b/llvm/lib/ObjectYAML/MachOYAML.cpp index 4857b5911ff2ef..b7eda97c22ae04 100644 --- a/llvm/lib/ObjectYAML/MachOYAML.cpp +++ b/llvm/lib/ObjectYAML/MachOYAML.cpp @@ -346,7 +346,10 @@ void MappingTraits::mapping(IO &IO, std::string MappingTraits::validate(IO &IO, MachOYAML::Section &Section) { - if (Section.content && Section.size < Section.content->binary_size()) + // Can't check the `size`, as it's required and may be left uninitialized by + // previous error. + if (!IO.error() && Section.content && + Section.size < Section.content->binary_size()) return "Section size must be greater than or equal to the content size"; return ""; } diff --git a/llvm/test/ObjectYAML/MachO/section_data.yaml b/llvm/test/ObjectYAML/MachO/section_data.yaml index 87c5bc803ee1a2..a2d9a3b7e1675b 100644 --- a/llvm/test/ObjectYAML/MachO/section_data.yaml +++ b/llvm/test/ObjectYAML/MachO/section_data.yaml @@ -159,3 +159,44 @@ LoadCommands: reserved2: 0x00000000 reserved3: 0x00000000 content: AA + +## Case 4: Don't validate if size is missing. 
+# RUN: not yaml2obj --docnum=4 %s -o %t1 2>&1 | FileCheck %s --check-prefix=CASE4 --implicit-check-not=error: +# CASE4: error: missing required key 'size' +# CASE4: error: failed to parse YAML + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x00000001 + ncmds: 1 + sizeofcmds: 232 + flags: 0x00002000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 4 + fileoff: 392 + filesize: 4 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __data + segname: __DATA + addr: 0x0000000000000000 + content: AA + offset: 0x00000188 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 diff --git a/llvm/test/tools/yaml2obj/ELF/custom-fill.yaml b/llvm/test/tools/yaml2obj/ELF/custom-fill.yaml index d770fdb9825327..cdb9a97889ac12 100644 --- a/llvm/test/tools/yaml2obj/ELF/custom-fill.yaml +++ b/llvm/test/tools/yaml2obj/ELF/custom-fill.yaml @@ -156,9 +156,10 @@ Sections: Pattern: "BB" ## Check that the "Size" field is mandatory. 
-# RUN: not yaml2obj --docnum=5 2>&1 %s | FileCheck %s --check-prefix=NOSIZE +# RUN: not yaml2obj --docnum=5 2>&1 %s | FileCheck %s --check-prefix=NOSIZE --implicit-check-not=error: -## NOSIZE: error: missing required key 'Size' +# NOSIZE: error: missing required key 'Size' +# NOSIZE: error: failed to parse YAML --- !ELF FileHeader: From ff17a4136dedba004d901a571c4fae501affd051 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 23 Jan 2025 12:54:35 -0500 Subject: [PATCH 184/208] [lldb] Remove support and workarounds for Android 4 and older (#124047) --- lldb/cmake/modules/LLDBConfig.cmake | 5 ---- lldb/include/lldb/Host/Time.h | 9 ------ lldb/source/Host/CMakeLists.txt | 1 - lldb/source/Host/android/LibcGlue.cpp | 28 ------------------- lldb/source/Host/common/Socket.cpp | 18 +----------- lldb/source/Host/posix/HostInfoPosix.cpp | 16 ----------- .../Host/posix/ProcessLauncherPosixFork.cpp | 5 +--- 7 files changed, 2 insertions(+), 80 deletions(-) delete mode 100644 lldb/source/Host/android/LibcGlue.cpp diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index 9bb37f5967d4f3..747f7e6038181c 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -306,9 +306,4 @@ else() set(LLDB_CAN_USE_DEBUGSERVER OFF) endif() -if ((CMAKE_SYSTEM_NAME MATCHES "Android") AND LLVM_BUILD_STATIC AND - ((ANDROID_ABI MATCHES "armeabi") OR (ANDROID_ABI MATCHES "mips"))) - add_definitions(-DANDROID_USE_ACCEPT_WORKAROUND) -endif() - include(LLDBGenerateConfig) diff --git a/lldb/include/lldb/Host/Time.h b/lldb/include/lldb/Host/Time.h index aee4c43247c5a3..2ca5a4026884b7 100644 --- a/lldb/include/lldb/Host/Time.h +++ b/lldb/include/lldb/Host/Time.h @@ -11,15 +11,6 @@ #ifndef LLDB_HOST_TIME_H #define LLDB_HOST_TIME_H -#ifdef __ANDROID__ -#include -#endif - -#if defined(__ANDROID_API__) && __ANDROID_API__ < 21 -#include -extern time_t timegm(struct tm *t); -#else #include -#endif #endif // LLDB_HOST_TIME_H diff --git 
a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index e0cd8569bf9575..cdfb6184f2219e 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -113,7 +113,6 @@ else() if (CMAKE_SYSTEM_NAME MATCHES "Android") add_host_subdirectory(android android/HostInfoAndroid.cpp - android/LibcGlue.cpp ) endif() elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") diff --git a/lldb/source/Host/android/LibcGlue.cpp b/lldb/source/Host/android/LibcGlue.cpp deleted file mode 100644 index 877d735823feee..00000000000000 --- a/lldb/source/Host/android/LibcGlue.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===-- LibcGlue.cpp ------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// This files adds functions missing from libc on earlier versions of Android - -#include - -#include - -#if __ANDROID_API__ < 21 - -#include -#include -#include -#include - -#include "lldb/Host/Time.h" - -time_t timegm(struct tm *t) { return (time_t)timegm64(t); } - -int posix_openpt(int flags) { return open("/dev/ptmx", flags); } - -#endif diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp index 0ccff41a552068..296c2273ba419c 100644 --- a/lldb/source/Host/common/Socket.cpp +++ b/lldb/source/Host/common/Socket.cpp @@ -472,23 +472,7 @@ Status Socket::Accept(const Timeout &timeout, Socket *&socket) { NativeSocket Socket::AcceptSocket(NativeSocket sockfd, struct sockaddr *addr, socklen_t *addrlen, Status &error) { error.Clear(); -#if defined(ANDROID_USE_ACCEPT_WORKAROUND) - // Hack: - // This enables static linking lldb-server to an API 21 libc, but still - // having it run on older devices. 
It is necessary because API 21 libc's - // implementation of accept() uses the accept4 syscall(), which is not - // available in older kernels. Using an older libc would fix this issue, but - // introduce other ones, as the old libraries were quite buggy. - int fd = syscall(__NR_accept, sockfd, addr, addrlen); - if (fd >= 0) { - int flags = ::fcntl(fd, F_GETFD); - if (flags != -1 && ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1) - return fd; - SetLastError(error); - close(fd); - } - return fd; -#elif defined(SOCK_CLOEXEC) && defined(HAVE_ACCEPT4) +#if defined(SOCK_CLOEXEC) && defined(HAVE_ACCEPT4) int flags = SOCK_CLOEXEC; NativeSocket fd = llvm::sys::RetryAfterSignal( static_cast(-1), ::accept4, sockfd, addr, addrlen, flags); diff --git a/lldb/source/Host/posix/HostInfoPosix.cpp b/lldb/source/Host/posix/HostInfoPosix.cpp index 193f584900b632..23ba3177de317a 100644 --- a/lldb/source/Host/posix/HostInfoPosix.cpp +++ b/lldb/source/Host/posix/HostInfoPosix.cpp @@ -86,13 +86,6 @@ std::optional HostInfoPosix::GetOSBuildString() { return std::string(un.release); } -#ifdef __ANDROID__ -#include -#endif -#if defined(__ANDROID_API__) && __ANDROID_API__ < 21 -#define USE_GETPWUID -#endif - namespace { class PosixUserIDResolver : public UserIDResolver { protected: @@ -107,14 +100,6 @@ struct PasswdEntry { }; static std::optional GetPassword(id_t uid) { -#ifdef USE_GETPWUID - // getpwuid_r is missing from android-9 - // The caller should provide some thread safety by making sure no one calls - // this function concurrently, because using getpwuid is ultimately not - // thread-safe as we don't know who else might be calling it. 
- if (auto *user_info_ptr = ::getpwuid(uid)) - return PasswdEntry{user_info_ptr->pw_name, user_info_ptr->pw_shell}; -#else struct passwd user_info; struct passwd *user_info_ptr = &user_info; char user_buffer[PATH_MAX]; @@ -124,7 +109,6 @@ static std::optional GetPassword(id_t uid) { user_info_ptr) { return PasswdEntry{user_info_ptr->pw_name, user_info_ptr->pw_shell}; } -#endif return std::nullopt; } diff --git a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp index 7b8b42a4b7fe07..22bf698c71716e 100644 --- a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp +++ b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp @@ -25,13 +25,10 @@ #include #ifdef __ANDROID__ -#include #define PT_TRACE_ME PTRACE_TRACEME #endif -#if defined(__ANDROID_API__) && __ANDROID_API__ < 15 -#include -#elif defined(__linux__) +#if defined(__linux__) #include #endif From 5a7d92f7a09d5580a298c2982bd42918b7ec492c Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 23 Jan 2025 17:55:03 +0000 Subject: [PATCH 185/208] [NFC] Remove invalid features from test and autogenerate checks. (#124130) --- .../CodeGen/AArch64/cpu-supports-target.c | 203 ++++++++++++++++-- 1 file changed, 189 insertions(+), 14 deletions(-) diff --git a/clang/test/CodeGen/AArch64/cpu-supports-target.c b/clang/test/CodeGen/AArch64/cpu-supports-target.c index b185dda2881080..6223db7c092534 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports-target.c +++ b/clang/test/CodeGen/AArch64/cpu-supports-target.c @@ -1,27 +1,150 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s -int check_all_feature() { +//. +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +//. 
+// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @check_all_features( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 66367 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 66367 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: store i32 1, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 14272 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 14272 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label %[[IF_THEN1:.*]], label %[[IF_ELSE2:.*]] +// CHECK: [[IF_THEN1]]: +// CHECK-NEXT: store i32 2, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE2]]: +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 2065152 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 2065152 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label %[[IF_THEN3:.*]], label %[[IF_ELSE4:.*]] +// CHECK: [[IF_THEN3]]: +// CHECK-NEXT: store i32 3, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE4]]: +// CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 288230376183169792 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 288230376183169792 +// CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label %[[IF_THEN5:.*]], label %[[IF_ELSE6:.*]] +// 
CHECK: [[IF_THEN5]]: +// CHECK-NEXT: store i32 4, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE6]]: +// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1275134720 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 1275134720 +// CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] +// CHECK-NEXT: br i1 [[TMP19]], label %[[IF_THEN7:.*]], label %[[IF_ELSE8:.*]] +// CHECK: [[IF_THEN7]]: +// CHECK-NEXT: store i32 5, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE8]]: +// CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 52814742272 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 52814742272 +// CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] +// CHECK-NEXT: br i1 [[TMP23]], label %[[IF_THEN9:.*]], label %[[IF_ELSE10:.*]] +// CHECK: [[IF_THEN9]]: +// CHECK-NEXT: store i32 6, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE10]]: +// CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 344671224576 +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 344671224576 +// CHECK-NEXT: [[TMP27:%.*]] = and i1 true, [[TMP26]] +// CHECK-NEXT: br i1 [[TMP27]], label %[[IF_THEN11:.*]], label %[[IF_ELSE12:.*]] +// CHECK: [[IF_THEN11]]: +// CHECK-NEXT: store i32 7, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE12]]: +// CHECK-NEXT: [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 3918083994400 +// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 3918083994400 +// CHECK-NEXT: [[TMP31:%.*]] = and i1 true, [[TMP30]] +// CHECK-NEXT: br i1 [[TMP31]], label %[[IF_THEN13:.*]], label %[[IF_ELSE14:.*]] +// CHECK: [[IF_THEN13]]: +// CHECK-NEXT: store i32 8, 
ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE14]]: +// CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 92359111017216 +// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 92359111017216 +// CHECK-NEXT: [[TMP35:%.*]] = and i1 true, [[TMP34]] +// CHECK-NEXT: br i1 [[TMP35]], label %[[IF_THEN15:.*]], label %[[IF_ELSE16:.*]] +// CHECK: [[IF_THEN15]]: +// CHECK-NEXT: store i32 9, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE16]]: +// CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 10836786603360256 +// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 10836786603360256 +// CHECK-NEXT: [[TMP39:%.*]] = and i1 true, [[TMP38]] +// CHECK-NEXT: br i1 [[TMP39]], label %[[IF_THEN17:.*]], label %[[IF_ELSE18:.*]] +// CHECK: [[IF_THEN17]]: +// CHECK-NEXT: store i32 10, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE18]]: +// CHECK-NEXT: [[TMP40:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 54047593709241088 +// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 54047593709241088 +// CHECK-NEXT: [[TMP43:%.*]] = and i1 true, [[TMP42]] +// CHECK-NEXT: br i1 [[TMP43]], label %[[IF_THEN19:.*]], label %[[IF_ELSE20:.*]] +// CHECK: [[IF_THEN19]]: +// CHECK-NEXT: store i32 11, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE20]]: +// CHECK-NEXT: [[TMP44:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], 216177180294578944 +// CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 216177180294578944 +// CHECK-NEXT: [[TMP47:%.*]] = and i1 true, [[TMP46]] +// CHECK-NEXT: br i1 [[TMP47]], label %[[IF_THEN21:.*]], label %[[IF_ELSE22:.*]] +// CHECK: [[IF_THEN21]]: +// CHECK-NEXT: store i32 12, 
ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE22]]: +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[RETURN]]: +// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP48]] +// +int check_all_features() { if (__builtin_cpu_supports("rng+flagm+flagm2+fp16fml+dotprod+sm4")) return 1; - else if (__builtin_cpu_supports("rdm+lse+fp+simd+crc+sha1+sha2+sha3")) + else if (__builtin_cpu_supports("rdm+lse+fp+simd+crc+sha2+sha3")) return 2; - else if (__builtin_cpu_supports("aes+pmull+fp16+dit+dpb+dpb2+jscvt")) + else if (__builtin_cpu_supports("aes+fp16+dit+dpb+dpb2+jscvt")) return 3; else if (__builtin_cpu_supports("fcma+rcpc+rcpc2+rcpc3+frintts")) return 4; else if (__builtin_cpu_supports("i8mm+bf16+sve")) return 5; - else if (__builtin_cpu_supports("sve+ebf16+i8mm+f32mm+f64mm")) + else if (__builtin_cpu_supports("sve+bf16+i8mm+f32mm+f64mm")) return 6; - else if (__builtin_cpu_supports("sve2+sve2-aes+sve2-pmull128")) + else if (__builtin_cpu_supports("sve2+sve2-aes")) return 7; else if (__builtin_cpu_supports("sve2-bitperm+sve2-sha3+sve2-sm4")) return 8; else if (__builtin_cpu_supports("sme+memtag+sb")) return 9; - else if (__builtin_cpu_supports("predres+ssbs+ssbs2+bti+ls64+ls64_v")) + else if (__builtin_cpu_supports("predres+ssbs+bti+ls64")) return 10; - else if (__builtin_cpu_supports("ls64_accdata+wfxt+sme-f64f64")) + else if (__builtin_cpu_supports("wfxt+sme-f64f64")) return 11; else if (__builtin_cpu_supports("sme-i16i64+sme2")) return 12; @@ -29,16 +152,62 @@ int check_all_feature() { return 0; } -// CHECK-LABEL: define dso_local i32 @neon_code() #1 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @neon_code( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 1 +// int __attribute__((target("simd"))) neon_code() { return 1; } -// CHECK-LABEL: define dso_local 
i32 @sve_code() #2 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @sve_code( +// CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 2 +// int __attribute__((target("sve"))) sve_code() { return 2; } -// CHECK-LABEL: define dso_local i32 @code() #0 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @code( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 3 +// int code() { return 3; } -// CHECK-LABEL: define dso_local i32 @test_versions() #0 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @test_versions( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073807616 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073807616 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: [[CALL:%.*]] = call i32 @sve_code() +// CHECK-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 768 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 768 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label %[[IF_THEN1:.*]], label %[[IF_ELSE3:.*]] +// CHECK: [[IF_THEN1]]: +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @neon_code() +// CHECK-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[IF_ELSE3]]: +// CHECK-NEXT: [[CALL4:%.*]] = call i32 @code() +// CHECK-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// 
CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[RETURN]]: +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP8]] +// int test_versions() { if (__builtin_cpu_supports("sve")) return sve_code(); @@ -47,6 +216,12 @@ int test_versions() { else return code(); } -// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } + +//. +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
From e0cd57decb3aa9eb911b62306b8f8ac88fd97ffd Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 23 Jan 2025 13:00:39 -0500 Subject: [PATCH 186/208] [compiler-rt] Remove support and workarounds for Android 4 and older (#124056) --- compiler-rt/lib/asan/tests/asan_test.cpp | 6 +---- compiler-rt/lib/lsan/lsan_common_linux.cpp | 5 ---- .../lib/sanitizer_common/sanitizer_common.h | 1 - .../sanitizer_common/sanitizer_getauxval.h | 23 +++++++++---------- .../lib/sanitizer_common/sanitizer_linux.cpp | 11 +-------- .../sanitizer_linux_libcdep.cpp | 10 +------- .../sanitizer_platform_limits_posix.cpp | 2 +- 7 files changed, 15 insertions(+), 43 deletions(-) diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp index 09d71569f89bba..56377bde1c8deb 100644 --- a/compiler-rt/lib/asan/tests/asan_test.cpp +++ b/compiler-rt/lib/asan/tests/asan_test.cpp @@ -1166,13 +1166,9 @@ TEST(AddressSanitizer, DISABLED_StressStackReuseAndExceptionsTest) { #if !defined(_WIN32) TEST(AddressSanitizer, MlockTest) { -#if !defined(__ANDROID__) || __ANDROID_API__ >= 17 EXPECT_EQ(0, mlockall(MCL_CURRENT)); -#endif - EXPECT_EQ(0, mlock((void*)0x12345, 0x5678)); -#if !defined(__ANDROID__) || __ANDROID_API__ >= 17 + EXPECT_EQ(0, mlock((void *)0x12345, 0x5678)); EXPECT_EQ(0, munlockall()); -#endif EXPECT_EQ(0, munlock((void*)0x987, 0x654)); } #endif diff --git a/compiler-rt/lib/lsan/lsan_common_linux.cpp b/compiler-rt/lib/lsan/lsan_common_linux.cpp index 7a0b2f038be0d3..6fd54bbea3c722 100644 --- a/compiler-rt/lib/lsan/lsan_common_linux.cpp +++ b/compiler-rt/lib/lsan/lsan_common_linux.cpp @@ -93,11 +93,6 @@ static int ProcessGlobalRegionsCallback(struct dl_phdr_info *info, size_t size, return 0; } -#if SANITIZER_ANDROID && __ANDROID_API__ < 21 -extern "C" __attribute__((weak)) int dl_iterate_phdr( - int (*)(struct dl_phdr_info *, size_t, void *), void *); -#endif - // Scans global variables for heap pointers. 
void ProcessGlobalRegions(Frontier *frontier) { if (!flags()->use_globals) return; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 0b5e68c5fd7978..d9e7ded593feb3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -927,7 +927,6 @@ typedef void (*RangeIteratorCallback)(uptr begin, uptr end, void *arg); enum AndroidApiLevel { ANDROID_NOT_ANDROID = 0, - ANDROID_KITKAT = 19, ANDROID_LOLLIPOP_MR1 = 22, ANDROID_POST_LOLLIPOP = 23 }; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h index 38439e44f611e6..910590b627c277 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h @@ -21,22 +21,21 @@ #if SANITIZER_LINUX || SANITIZER_FUCHSIA -# if (__GLIBC_PREREQ(2, 16) || (SANITIZER_ANDROID && __ANDROID_API__ >= 21) || \ - SANITIZER_FUCHSIA) && \ - !SANITIZER_GO -# define SANITIZER_USE_GETAUXVAL 1 -# else -# define SANITIZER_USE_GETAUXVAL 0 -# endif - -# if SANITIZER_USE_GETAUXVAL -# include -# else +# if (__GLIBC_PREREQ(2, 16) || SANITIZER_ANDROID || SANITIZER_FUCHSIA) && \ + !SANITIZER_GO +# define SANITIZER_USE_GETAUXVAL 1 +# else +# define SANITIZER_USE_GETAUXVAL 0 +# endif + +# if SANITIZER_USE_GETAUXVAL +# include +# else // The weak getauxval definition allows to check for the function at runtime. // This is useful for Android, when compiled at a lower API level yet running // on a more recent platform that offers the function. 
extern "C" SANITIZER_WEAK_ATTRIBUTE unsigned long getauxval(unsigned long type); -# endif +# endif #elif SANITIZER_NETBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 04b095dca904ab..997b95f343d41e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1849,11 +1849,6 @@ int internal_uname(struct utsname *buf) { # endif # if SANITIZER_ANDROID -# if __ANDROID_API__ < 21 -extern "C" __attribute__((weak)) int dl_iterate_phdr( - int (*)(struct dl_phdr_info *, size_t, void *), void *); -# endif - static int dl_iterate_phdr_test_cb(struct dl_phdr_info *info, size_t size, void *data) { // Any name starting with "lib" indicates a bug in L where library base names @@ -1869,9 +1864,7 @@ static int dl_iterate_phdr_test_cb(struct dl_phdr_info *info, size_t size, static atomic_uint32_t android_api_level; static AndroidApiLevel AndroidDetectApiLevelStatic() { -# if __ANDROID_API__ <= 19 - return ANDROID_KITKAT; -# elif __ANDROID_API__ <= 22 +# if __ANDROID_API__ <= 22 return ANDROID_LOLLIPOP_MR1; # else return ANDROID_POST_LOLLIPOP; @@ -1879,8 +1872,6 @@ static AndroidApiLevel AndroidDetectApiLevelStatic() { } static AndroidApiLevel AndroidDetectApiLevel() { - if (!&dl_iterate_phdr) - return ANDROID_KITKAT; // K or lower bool base_name_seen = false; dl_iterate_phdr(dl_iterate_phdr_test_cb, &base_name_seen); if (base_name_seen) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 62b1dc43dce136..e11eff13cd3262 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -773,11 +773,6 @@ static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) { return 0; } -# if SANITIZER_ANDROID && __ANDROID_API__ < 21 -extern "C" __attribute__((weak)) 
int dl_iterate_phdr( - int (*)(struct dl_phdr_info *, size_t, void *), void *); -# endif - static bool requiresProcmaps() { # if SANITIZER_ANDROID && __ANDROID_API__ <= 22 // Fall back to /proc/maps if dl_iterate_phdr is unavailable or broken. @@ -940,11 +935,8 @@ extern "C" SANITIZER_WEAK_ATTRIBUTE int __android_log_write(int prio, void WriteOneLineToSyslog(const char *s) { if (&async_safe_write_log) { async_safe_write_log(SANITIZER_ANDROID_LOG_INFO, GetProcessName(), s); - } else if (AndroidGetApiLevel() > ANDROID_KITKAT) { - syslog(LOG_INFO, "%s", s); } else { - CHECK(&__android_log_write); - __android_log_write(SANITIZER_ANDROID_LOG_INFO, nullptr, s); + syslog(LOG_INFO, "%s", s); } } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp index ddd67cb43524d4..a5311d266b0c45 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -1093,7 +1093,7 @@ CHECK_SIZE_AND_OFFSET(cmsghdr, cmsg_len); CHECK_SIZE_AND_OFFSET(cmsghdr, cmsg_level); CHECK_SIZE_AND_OFFSET(cmsghdr, cmsg_type); -#if SANITIZER_LINUX && (__ANDROID_API__ >= 21 || __GLIBC_PREREQ (2, 14)) +# if SANITIZER_LINUX && (SANITIZER_ANDROID || __GLIBC_PREREQ(2, 14)) CHECK_TYPE_SIZE(mmsghdr); CHECK_SIZE_AND_OFFSET(mmsghdr, msg_hdr); CHECK_SIZE_AND_OFFSET(mmsghdr, msg_len); From 6045146014151a8f63a60612445de9ff6af47626 Mon Sep 17 00:00:00 2001 From: Alex Prabhat Bara <50404684+alexprabhat99@users.noreply.github.com> Date: Thu, 23 Jan 2025 23:31:39 +0530 Subject: [PATCH 187/208] [libc] change return type of pthread_setspecific to int in generated header (#124072) Fixes: #124032 --- libc/include/pthread.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/include/pthread.yaml b/libc/include/pthread.yaml index 4f386bdd11cfd7..5b27e68d2f2d8d 100644 --- a/libc/include/pthread.yaml +++ 
b/libc/include/pthread.yaml @@ -402,7 +402,7 @@ functions: - name: pthread_setspecific standards: - POSIX - return_type: void * + return_type: int arguments: - type: pthread_key_t - type: const void * From 02906931654460ca04a4b74f6aef65b542c73d2d Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 23 Jan 2025 10:02:38 -0800 Subject: [PATCH 188/208] [llvm-profdata] XFAIL broken test on windows (#124165) XFAIL `llvm/test/tools/llvm-profdata/general.proftext` after it was accidentally broken by https://github.com/llvm/llvm-project/pull/105915/. I will follow up to get this fixed. --- llvm/test/tools/llvm-profdata/general.proftext | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/tools/llvm-profdata/general.proftext b/llvm/test/tools/llvm-profdata/general.proftext index 89762f2540f6a6..ca532f9a37116d 100644 --- a/llvm/test/tools/llvm-profdata/general.proftext +++ b/llvm/test/tools/llvm-profdata/general.proftext @@ -1,3 +1,5 @@ +# FIXME: Somehow this is failing on windows after https://github.com/llvm/llvm-project/pull/105915 +# XFAIL: system-windows # RUN: llvm-profdata merge -sparse=true %s -o %t.profdata # RUN: llvm-profdata merge -sparse=false %s -o %t.profdata.dense From db6b7a84e6e4949569e756f46357d9f54ad16a03 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Jan 2025 12:02:54 -0600 Subject: [PATCH 189/208] [libc][NFC] Strip all training whitespace and missing newlines (#124163) --- libc/benchmarks/distributions/README.md | 2 +- libc/benchmarks/gpu/CMakeLists.txt | 2 +- libc/config/baremetal/riscv/entrypoints.txt | 2 +- libc/config/linux/aarch64/entrypoints.txt | 2 +- libc/config/linux/arm/entrypoints.txt | 4 ++-- libc/config/linux/riscv/entrypoints.txt | 4 ++-- libc/config/linux/x86_64/entrypoints.txt | 6 +++--- libc/docs/dev/printf_behavior.rst | 2 +- libc/docs/dev/undefined_behavior.rst | 4 ++-- libc/docs/gpu/rpc.rst | 4 ++-- libc/docs/platform_support.rst | 2 +- libc/fuzzing/__support/CMakeLists.txt | 2 +- 
libc/include/llvm-libc-types/CMakeLists.txt | 12 ++++++------ libc/include/stdfix.yaml | 2 +- libc/include/sys/uio.yaml | 2 +- libc/src/__support/CPP/CMakeLists.txt | 4 ++-- libc/src/__support/threads/linux/CMakeLists.txt | 2 +- libc/src/__support/threads/spin_lock.h | 2 +- libc/src/__support/time/windows/CMakeLists.txt | 2 +- libc/src/math/nvptx/CMakeLists.txt | 2 +- libc/src/stdlib/CMakeLists.txt | 6 +++--- libc/test/src/__support/File/CMakeLists.txt | 2 +- libc/test/src/math/CMakeLists.txt | 4 ++-- libc/test/src/math/smoke/CMakeLists.txt | 2 +- libc/test/src/signal/CMakeLists.txt | 6 +++--- libc/test/src/stdfix/CMakeLists.txt | 2 +- libc/test/src/sys/resource/CMakeLists.txt | 2 +- libc/test/src/sys/select/CMakeLists.txt | 4 ++-- libc/test/src/sys/sendfile/CMakeLists.txt | 2 +- libc/test/src/sys/wait/CMakeLists.txt | 4 ++-- libc/test/src/unistd/CMakeLists.txt | 4 ++-- libc/utils/docgen/aio.yaml | 2 +- libc/utils/docgen/net/if.yaml | 2 +- libc/utils/docgen/netinet/in.yaml | 2 +- libc/utils/docgen/sys/resource.yaml | 2 +- libc/utils/docgen/sys/stat.yaml | 4 ++-- libc/utils/docgen/sys/time.yaml | 2 +- libc/utils/docgen/sys/wait.yaml | 2 +- libc/utils/docgen/termios.yaml | 12 ++++++------ libc/utils/mathtools/worst_case.sollya | 4 ++-- 40 files changed, 67 insertions(+), 67 deletions(-) diff --git a/libc/benchmarks/distributions/README.md b/libc/benchmarks/distributions/README.md index 135ba7bc822218..9c665885419dcb 100644 --- a/libc/benchmarks/distributions/README.md +++ b/libc/benchmarks/distributions/README.md @@ -31,4 +31,4 @@ As identified in the [automemcpy](https://research.google/pubs/pub50338/) paper: ## Note -Except for `GoogleD`, all distributions are gathered over one week worth of data. \ No newline at end of file +Except for `GoogleD`, all distributions are gathered over one week worth of data. 
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 5fa3e44e8d48c3..b58f4fd8b1a429 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -10,7 +10,7 @@ function(add_benchmark benchmark_name) "LINK_LIBRARIES;DEPENDS" # Multi-value arguments ${ARGN} ) - + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) message(FATAL_ERROR "target does not support clock") endif() diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 6dc5df830eb000..667ab40dca9998 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -463,7 +463,7 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.kbits libc.src.stdfix.ukbits libc.src.stdfix.lkbits - libc.src.stdfix.ulkbits + libc.src.stdfix.ulkbits ) endif() diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f5ba3414117682..6e5ecba6200a4b 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -439,7 +439,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl - libc.src.math.fadd + libc.src.math.fadd libc.src.math.faddl libc.src.math.fadd libc.src.math.fdim diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 83f4dfaaa2d0f2..42ea803baac047 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -215,7 +215,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.complex.cproj libc.src.complex.cprojf libc.src.complex.cprojl - + # fenv.h entrypoints libc.src.fenv.feclearexcept libc.src.fenv.fedisableexcept @@ -268,7 +268,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl - libc.src.math.fadd + libc.src.math.fadd libc.src.math.faddl libc.src.math.fadd libc.src.math.fdim diff --git 
a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 49a8d61b938027..36339126b1f22f 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -432,7 +432,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl - libc.src.math.fadd + libc.src.math.fadd libc.src.math.faddl libc.src.math.fadd libc.src.math.fdim @@ -630,7 +630,7 @@ if(LIBC_TYPES_HAS_CFLOAT128) ) endif() -if(LIBC_TYPES_HAS_FLOAT128) +if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 0c1ae9561a7e69..6662175c530217 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -377,7 +377,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.complex.cproj libc.src.complex.cprojf libc.src.complex.cprojl - + # fenv.h entrypoints libc.src.fenv.feclearexcept libc.src.fenv.fedisableexcept @@ -440,7 +440,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl - libc.src.math.fadd + libc.src.math.fadd libc.src.math.faddl libc.src.math.fadd libc.src.math.fdim @@ -756,7 +756,7 @@ if(LIBC_TYPES_HAS_CFLOAT128) endif() if(LIBC_TYPES_HAS_FLOAT128) - list(APPEND TARGET_LIBM_ENTRYPOINTS + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 libc.src.math.ceilf128 diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst index f5507c4d167587..01ab128a1f238b 100644 --- a/libc/docs/dev/printf_behavior.rst +++ b/libc/docs/dev/printf_behavior.rst @@ -173,7 +173,7 @@ If a number passed as a field width or precision value is out of range for an int, then it will be treated as the largest value in the int range (e.g. "%-999999999999.999999999999s" is the same as "%-2147483647.2147483647s"). 
-If the field width is set to INT_MIN by using the '*' form, +If the field width is set to INT_MIN by using the '*' form, e.g. printf("%*d", INT_MIN, 1), it will be treated as INT_MAX, since -INT_MIN is not representable as an int. diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst index d0d882b7010e37..60fda51e86452c 100644 --- a/libc/docs/dev/undefined_behavior.rst +++ b/libc/docs/dev/undefined_behavior.rst @@ -78,8 +78,8 @@ POSIX.1 leaves that when the name of a shared memory object does not begin with Handling of NULL arguments to the 's' format specifier ------------------------------------------------------ The C standard does not specify behavior for ``printf("%s", NULL)``. We will -print the string literal ``(null)`` unless using the -``LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS`` option described in :ref:`printf +print the string literal ``(null)`` unless using the +``LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS`` option described in :ref:`printf behavior`. Unknown Math Rounding Direction diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst index 0d169c7db9a50f..3f312bb9281b38 100644 --- a/libc/docs/gpu/rpc.rst +++ b/libc/docs/gpu/rpc.rst @@ -253,7 +253,7 @@ linked in by forwarding the static library to the device-side link job. Extensions ---------- -The opcode is a 32-bit integer that must be unique to the requested operation. -All opcodes used by ``libc`` internally have the character ``c`` in the most +The opcode is a 32-bit integer that must be unique to the requested operation. +All opcodes used by ``libc`` internally have the character ``c`` in the most significant byte. Any other opcode is available for use outside of the ``libc`` implementation. 
diff --git a/libc/docs/platform_support.rst b/libc/docs/platform_support.rst index 2ce3d7282b304f..4643d82e2308b2 100644 --- a/libc/docs/platform_support.rst +++ b/libc/docs/platform_support.rst @@ -12,7 +12,7 @@ For Linux, we support kernel versions as listed on date), ``stable``, and ``mainline`` versions. We actively adopt new features from ``linux-next``. -For Windows, we plan to support products within their lifecycle. Please refer to +For Windows, we plan to support products within their lifecycle. Please refer to `Search Product and Services Lifecycle Information `_ for more information. LLVM-libc does not guarantee backward compatibility with operating systems that diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt index d4b324db716f68..7742ee43860d53 100644 --- a/libc/fuzzing/__support/CMakeLists.txt +++ b/libc/fuzzing/__support/CMakeLists.txt @@ -22,7 +22,7 @@ add_libc_fuzzer( libc.src.__support.HashTable.table COMPILE_OPTIONS -D__LIBC_EXPLICIT_SIMD_OPT -) +) # TODO: FreeListHeap uses the _end symbol which conflicts with the _end symbol # defined by GPU start.cpp files so for now we exclude this fuzzer on GPU. 
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index ee734eafce3620..6cbaa1ac0b30c3 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -102,15 +102,15 @@ add_header(__getoptargv_t HDR __getoptargv_t.h) add_header(wchar_t HDR wchar_t.h) add_header(char8_t HDR char8_t.h) add_header( - char16_t - HDR + char16_t + HDR char16_t.h DEPENDS libc.include.llvm-libc-macros.stdint_macros ) add_header( - char32_t - HDR + char32_t + HDR char32_t.h DEPENDS libc.include.llvm-libc-macros.stdint_macros @@ -145,8 +145,8 @@ add_header(cfloat16 HDR cfloat16.h) add_header(fsblkcnt_t HDR fsblkcnt_t.h) add_header(fsfilcnt_t HDR fsfilcnt_t.h) add_header( - struct_statvfs -HDR + struct_statvfs +HDR struct_statvfs.h DEPENDS .fsblkcnt_t diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml index 7b3bdba082dd5c..9663ac0c7df4dc 100644 --- a/libc/include/stdfix.yaml +++ b/libc/include/stdfix.yaml @@ -1,7 +1,7 @@ header: stdfix.h header_template: stdfix.h.def macros: [] -types: +types: - type_name: stdfix-types enums: [] objects: [] diff --git a/libc/include/sys/uio.yaml b/libc/include/sys/uio.yaml index 808d8ec790198e..87c5bdff48245c 100644 --- a/libc/include/sys/uio.yaml +++ b/libc/include/sys/uio.yaml @@ -1,7 +1,7 @@ header: sys/uio.h header_template: uio.h.def macros: [] -types: +types: - type_name: struct_iovec - type_name: ssize_t enums: [] diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index 15fad9de0ed6d2..d2ba00a5384da5 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -83,7 +83,7 @@ add_header_library( .string_view libc.hdr.func.free libc.hdr.func.malloc - libc.hdr.func.realloc + libc.hdr.func.realloc libc.src.__support.common libc.src.__support.integer_to_string libc.src.string.memory_utils.inline_memcpy @@ -203,7 +203,7 @@ add_object_library( DEPENDS libc.hdr.func.free 
libc.hdr.func.malloc - libc.hdr.func.aligned_alloc + libc.hdr.func.aligned_alloc libc.src.__support.common libc.src.__support.macros.properties.os ) diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index 47598d98c98863..364e7e2b905854 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -79,7 +79,7 @@ add_object_library( .futex_utils libc.config.app_h libc.include.sys_syscall - libc.hdr.fcntl_macros + libc.hdr.fcntl_macros libc.src.errno.errno libc.src.__support.CPP.atomic libc.src.__support.CPP.stringstream diff --git a/libc/src/__support/threads/spin_lock.h b/libc/src/__support/threads/spin_lock.h index e176ad9eeac2ab..3b424b30425a76 100644 --- a/libc/src/__support/threads/spin_lock.h +++ b/libc/src/__support/threads/spin_lock.h @@ -34,7 +34,7 @@ class SpinLock { // .LBB0_2: | jmp .LBB0_4 // isb | .LBB0_2: // .LBB0_3: | pause - // ldrb w9, [x0] | .LBB0_3: + // ldrb w9, [x0] | .LBB0_3: // tbnz w9, #0, .LBB0_2 | movzx eax, byte ptr [rdi] // b .LBB0_1 | test al, 1 // .LBB0_4: | jne .LBB0_2 diff --git a/libc/src/__support/time/windows/CMakeLists.txt b/libc/src/__support/time/windows/CMakeLists.txt index dd0ac2f2f79aee..f31dfbdf26040c 100644 --- a/libc/src/__support/time/windows/CMakeLists.txt +++ b/libc/src/__support/time/windows/CMakeLists.txt @@ -1,5 +1,5 @@ add_header_library( - performance_counter + performance_counter HDRS performance_counter.h DEPENDS diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt index 577d8147df433c..e85e17b6701fe3 100644 --- a/libc/src/math/nvptx/CMakeLists.txt +++ b/libc/src/math/nvptx/CMakeLists.txt @@ -9,7 +9,7 @@ if(CUDAToolkit_FOUND) if (EXISTS ${libdevice_path}) message(STATUS "Found the CUDA device library. 
Implementations falling back " "to the vendor libraries will be resolved statically.") - set(bitcode_link_flags + set(bitcode_link_flags "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}") endif() else() diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 40ba9ead9a7ae6..73a9fbf1e2ddc3 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -539,7 +539,7 @@ add_entrypoint_object( libc.src.__support.OSUtil.osutil ) -# TODO: Move all exit functions to linux specific +# TODO: Move all exit functions to linux specific if(TARGET libc.src.__support.threads.mutex) add_header_library( @@ -563,7 +563,7 @@ add_entrypoint_object( HDRS atexit.h CXX_STANDARD - 20 # For constinit + 20 # For constinit DEPENDS .exit_handler ) @@ -575,7 +575,7 @@ add_entrypoint_object( HDRS at_quick_exit.h CXX_STANDARD - 20 # For constinit + 20 # For constinit DEPENDS .exit_handler .atexit diff --git a/libc/test/src/__support/File/CMakeLists.txt b/libc/test/src/__support/File/CMakeLists.txt index 04205166bf5337..a11f52978f35f6 100644 --- a/libc/test/src/__support/File/CMakeLists.txt +++ b/libc/test/src/__support/File/CMakeLists.txt @@ -16,7 +16,7 @@ add_libc_test( DEPENDS libc.include.stdio libc.hdr.types.size_t - libc.src.errno.errno + libc.src.errno.errno libc.src.__support.CPP.new libc.src.__support.File.file ) diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index ae8518ee4b4cc1..bbcdf2363c1e2b 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -104,7 +104,7 @@ add_fp_unittest( SRCS sinf16_test.cpp DEPENDS - libc.src.math.sinf16 + libc.src.math.sinf16 ) add_fp_unittest( @@ -198,7 +198,7 @@ add_fp_unittest( SRCS tanf16_test.cpp DEPENDS - libc.src.math.tanf16 + libc.src.math.tanf16 ) add_fp_unittest( diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index e23e7f41222d4a..e4501eb75fa48a 100644 --- 
a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -129,7 +129,7 @@ add_fp_unittest( tanf16_test.cpp DEPENDS libc.src.errno.errno - libc.src.math.tanf16 + libc.src.math.tanf16 ) add_fp_unittest( diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index 87aa42faae8e5f..a27f5b8f1000e9 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -62,7 +62,7 @@ add_libc_unittest( sigaddset_test.cpp DEPENDS libc.include.signal - libc.src.errno.errno + libc.src.errno.errno libc.src.signal.sigaddset libc.test.UnitTest.ErrnoSetterMatcher ) @@ -89,7 +89,7 @@ add_libc_unittest( sigfillset_test.cpp DEPENDS libc.include.signal - libc.src.errno.errno + libc.src.errno.errno libc.src.signal.raise libc.src.signal.sigfillset libc.src.signal.sigprocmask @@ -104,7 +104,7 @@ add_libc_unittest( sigdelset_test.cpp DEPENDS libc.include.signal - libc.src.errno.errno + libc.src.errno.errno libc.src.signal.raise libc.src.signal.sigdelset libc.src.signal.sigfillset diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt index 60e38c9098c387..90d20438edb4be 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -57,7 +57,7 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.stdfix.round${suffix} libc.src.__support.fixed_point.fx_bits ) - + add_libc_test( ${suffix}bits_test SUITE diff --git a/libc/test/src/sys/resource/CMakeLists.txt b/libc/test/src/sys/resource/CMakeLists.txt index 32186de4b111be..0a0f35bcbe556e 100644 --- a/libc/test/src/sys/resource/CMakeLists.txt +++ b/libc/test/src/sys/resource/CMakeLists.txt @@ -11,7 +11,7 @@ add_libc_unittest( DEPENDS libc.include.fcntl libc.include.sys_resource - libc.src.errno.errno + libc.src.errno.errno libc.src.fcntl.open libc.src.sys.resource.getrlimit libc.src.sys.resource.setrlimit diff --git a/libc/test/src/sys/select/CMakeLists.txt 
b/libc/test/src/sys/select/CMakeLists.txt index 2b465d32c2c33e..ff677926c7b6e1 100644 --- a/libc/test/src/sys/select/CMakeLists.txt +++ b/libc/test/src/sys/select/CMakeLists.txt @@ -9,7 +9,7 @@ add_libc_unittest( select_ui_test.cpp DEPENDS libc.include.unistd - libc.src.errno.errno + libc.src.errno.errno libc.src.sys.select.select libc.src.unistd.read ) @@ -22,7 +22,7 @@ add_libc_unittest( select_failure_test.cpp DEPENDS libc.include.unistd - libc.src.errno.errno + libc.src.errno.errno libc.src.sys.select.select libc.src.unistd.read libc.test.UnitTest.ErrnoSetterMatcher diff --git a/libc/test/src/sys/sendfile/CMakeLists.txt b/libc/test/src/sys/sendfile/CMakeLists.txt index ceaa4accdd06ef..06c61bca4255eb 100644 --- a/libc/test/src/sys/sendfile/CMakeLists.txt +++ b/libc/test/src/sys/sendfile/CMakeLists.txt @@ -11,7 +11,7 @@ add_libc_unittest( DEPENDS libc.hdr.fcntl_macros libc.include.sys_stat - libc.src.errno.errno + libc.src.errno.errno libc.src.fcntl.open libc.src.sys.sendfile.sendfile libc.src.unistd.close diff --git a/libc/test/src/sys/wait/CMakeLists.txt b/libc/test/src/sys/wait/CMakeLists.txt index db737a46f0d0f3..9acd74d9ce32c5 100644 --- a/libc/test/src/sys/wait/CMakeLists.txt +++ b/libc/test/src/sys/wait/CMakeLists.txt @@ -8,7 +8,7 @@ add_libc_unittest( waitpid_test.cpp DEPENDS libc.include.sys_wait - libc.src.errno.errno + libc.src.errno.errno libc.src.sys.wait.waitpid ) @@ -20,6 +20,6 @@ add_libc_unittest( wait4_test.cpp DEPENDS libc.include.sys_wait - libc.src.errno.errno + libc.src.errno.errno libc.src.sys.wait.wait4 ) diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index c3eebdf2a877d3..b01cce931a1ebf 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -29,7 +29,7 @@ add_libc_unittest( libc.src.errno.errno libc.src.unistd.chdir libc.src.unistd.close - libc.src.fcntl.open + libc.src.fcntl.open libc.test.UnitTest.ErrnoSetterMatcher ) @@ -427,7 +427,7 @@ add_libc_unittest( 
libc.include.unistd libc.hdr.fcntl_macros libc.include.sys_syscall - libc.src.errno.errno + libc.src.errno.errno libc.src.unistd.__llvm_libc_syscall libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/utils/docgen/aio.yaml b/libc/utils/docgen/aio.yaml index 2c381558676a18..e30c76c9dc05c6 100644 --- a/libc/utils/docgen/aio.yaml +++ b/libc/utils/docgen/aio.yaml @@ -15,7 +15,7 @@ macros: in-latest-posix: '' LIO_WRITE: in-latest-posix: '' - + functions: aio_cancel: in-latest-posix: '' diff --git a/libc/utils/docgen/net/if.yaml b/libc/utils/docgen/net/if.yaml index 085d27b2bb94fa..ad0ad62b3f9a90 100644 --- a/libc/utils/docgen/net/if.yaml +++ b/libc/utils/docgen/net/if.yaml @@ -10,4 +10,4 @@ functions: macros: IF_NAMESIZE: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/docgen/netinet/in.yaml b/libc/utils/docgen/netinet/in.yaml index 69cab901818419..513a4eda689eea 100644 --- a/libc/utils/docgen/netinet/in.yaml +++ b/libc/utils/docgen/netinet/in.yaml @@ -56,4 +56,4 @@ macros: IN6_IS_ADDR_MC_ORGLOCAL: in-latest-posix: '' IN6_IS_ADDR_MC_GLOBAL: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/docgen/sys/resource.yaml b/libc/utils/docgen/sys/resource.yaml index 0b8ac219853267..2d7c34133b9d1c 100644 --- a/libc/utils/docgen/sys/resource.yaml +++ b/libc/utils/docgen/sys/resource.yaml @@ -40,4 +40,4 @@ macros: RUSAGE_SELF: in-latest-posix: '' RUSAGE_CHILDREN: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/docgen/sys/stat.yaml b/libc/utils/docgen/sys/stat.yaml index 86dc84a1e06d2e..7d38dabac13188 100644 --- a/libc/utils/docgen/sys/stat.yaml +++ b/libc/utils/docgen/sys/stat.yaml @@ -42,7 +42,7 @@ macros: in-latest-posix: '' S_IXGRP: in-latest-posix: '' - + S_IRWXO: in-latest-posix: '' S_IROTH: @@ -115,4 +115,4 @@ functions: umask: in-latest-posix: '' utimensat: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' 
diff --git a/libc/utils/docgen/sys/time.yaml b/libc/utils/docgen/sys/time.yaml index a1d19c3fc636cd..1e3909adafea39 100644 --- a/libc/utils/docgen/sys/time.yaml +++ b/libc/utils/docgen/sys/time.yaml @@ -2,4 +2,4 @@ functions: select: in-latest-posix: '' utimes: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/docgen/sys/wait.yaml b/libc/utils/docgen/sys/wait.yaml index 91d67ad4a358bf..3a50638ea4ac94 100644 --- a/libc/utils/docgen/sys/wait.yaml +++ b/libc/utils/docgen/sys/wait.yaml @@ -34,4 +34,4 @@ macros: WUNTRACED: in-latest-posix: '' WCORE_DUMPED: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/docgen/termios.yaml b/libc/utils/docgen/termios.yaml index 81dd8da9f240cc..107b36423de094 100644 --- a/libc/utils/docgen/termios.yaml +++ b/libc/utils/docgen/termios.yaml @@ -31,7 +31,7 @@ macros: in-latest-posix: '' IGNBRK: in-latest-posix: '' - IGNCR: + IGNCR: in-latest-posix: '' IGNPAR: in-latest-posix: '' @@ -64,7 +64,7 @@ macros: in-latest-posix: '' OFILL: in-latest-posix: '' - + NLDLY: in-latest-posix: '' NL0: @@ -139,7 +139,7 @@ macros: in-latest-posix: '' B2400: in-latest-posix: '' - B4800: + B4800: in-latest-posix: '' B9600: in-latest-posix: '' @@ -231,13 +231,13 @@ functions: in-latest-posix: '' tcgetattr: in-latest-posix: '' - tcgetsid: + tcgetsid: in-latest-posix: '' tcgetwinsize: in-latest-posix: '' tcsendbreak: in-latest-posix: '' - tcsetattr: + tcsetattr: in-latest-posix: '' tcsetwinsize: - in-latest-posix: '' \ No newline at end of file + in-latest-posix: '' diff --git a/libc/utils/mathtools/worst_case.sollya b/libc/utils/mathtools/worst_case.sollya index 3a8d11b3da44d5..7cb549c7602576 100644 --- a/libc/utils/mathtools/worst_case.sollya +++ b/libc/utils/mathtools/worst_case.sollya @@ -1,6 +1,6 @@ // Implement WorstCase functions to compute the worst case for x mod C, with // the exponent of x ranges from emin to emax, and precision of x is p. 
-// Adapted to Sollya from the Maple function in +// Adapted to Sollya from the Maple function in // J-M. Muller, "Elementary Functions", 3rd ed, Section 11.3.2. // // Some examples: @@ -8,7 +8,7 @@ // 1) Worst case for trig range reduction fast passes: // // Single precision -// > WorstCase(24, -6, 32, pi/32, 128); +// > WorstCase(24, -6, 32, pi/32, 128); // numbermin : 10741887 // expmin : 7 // Worst case: 0x1.47d0fep30 From 2f6b0b4a8522b540de07c9ebd3446433e7d99eb6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 23 Jan 2025 10:03:33 -0800 Subject: [PATCH 190/208] [RISCV] Add SiFive sf.vqmacc tests to vmv-copy.mir. NFC (#124075) The vqmaccu.2x8x2 test is currently being miscompiled. We need to use a whole register move instead of vmv.v.v. The input has VL elements with EEW=8 EMUL=4. The output has VL/4 elements with EEW=32 EMUL=4. We can't use the original VL or input SEW for a vmv.v.v. --- llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir index f7d5004e11752f..fad2b1b325a48c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir @@ -280,6 +280,38 @@ body: | $v24_v25_v26_v27_v28_v29_v30_v31 = COPY killed $v8_v9_v10_v11_v12_v13_v14_v15 ... 
--- +name: copy_sifive_custom_macc +tracksRegLiveness: true +body: | + bb.0: + liveins: $x2, $x10, $v8, $v13, $v4m4, $v16m4 + ; CHECK-LABEL: name: copy_sifive_custom_macc + ; CHECK: liveins: $x2, $x10, $v8, $v13, $v4m4, $v16m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = PseudoVSETVLI $x10, 66 /* e8, m4, ta, mu */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: early-clobber $v4m4 = PseudoVQMACCUS_2x8x2_M4 renamable $v4m4, killed renamable $v13, killed renamable $v16m4, $noreg, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v16m4 = PseudoVMV_V_V_M4 undef $v16m4, $v4m4, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + $x0 = PseudoVSETVLI $x10, 66, implicit-def $vl, implicit-def $vtype + early-clobber $v4m4 = PseudoVQMACCUS_2x8x2_M4 renamable $v4m4, killed renamable $v13, killed renamable $v16m4, $noreg, 3, 1, implicit $vl, implicit $vtype + $v16m4 = COPY renamable $v4m4 +... +--- +name: copy_sifive_custom_macc1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x2, $x10, $v8, $v13, $v4m4, $v16m2 + ; CHECK-LABEL: name: copy_sifive_custom_macc1 + ; CHECK: liveins: $x2, $x10, $v8, $v13, $v4m4, $v16m2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = PseudoVSETVLI $x10, 65 /* e8, m2, ta, mu */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: early-clobber $v4m4 = PseudoVQMACCUS_4x8x4_M2 renamable $v4m4, killed renamable $v13, killed renamable $v16m2, $noreg, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v16m4 = VMV4R_V $v4m4, implicit $vtype + $x0 = PseudoVSETVLI $x10, 65, implicit-def $vl, implicit-def $vtype + early-clobber $v4m4 = PseudoVQMACCUS_4x8x4_M2 renamable $v4m4, killed renamable $v13, killed renamable $v16m2, $noreg, 3, 1, implicit $vl, implicit $vtype + $v16m4 = COPY renamable $v4m4 +... 
+--- name: copy_narrow_copies_in_between tracksRegLiveness: true body: | From bec4c7f5f7fb044dbc7b134a00f4cf29b5cb2b48 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 23 Jan 2025 18:04:27 +0000 Subject: [PATCH 191/208] [InstCombine] Unpack scalable struct loads/stores. (#123986) This teaches unpackLoadToAggregate and unpackStoreToAggregate to unpack scalable structs to individual loads/stores with insertvalues / extractvalues. The gep used for the offsets uses an i8 ptradd as opposed to a struct gep, as the geps for scalable structs are not supported and we canonicalize to i8. --- .../InstCombineLoadStoreAlloca.cpp | 38 ++++------ .../InstCombine/scalable-vector-struct.ll | 70 +++++++++++++++---- 2 files changed, 67 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 61f1c17592e966..f80bbffbab547e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -704,29 +704,22 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { const DataLayout &DL = IC.getDataLayout(); auto *SL = DL.getStructLayout(ST); - // Don't unpack for structure with scalable vector. 
- if (SL->getSizeInBits().isScalable()) - return nullptr; - if (SL->hasPadding()) return nullptr; const auto Align = LI.getAlign(); auto *Addr = LI.getPointerOperand(); - auto *IdxType = Type::getInt32Ty(T->getContext()); - auto *Zero = ConstantInt::get(IdxType, 0); + auto *IdxType = DL.getIndexType(Addr->getType()); Value *V = PoisonValue::get(T); for (unsigned i = 0; i < NumElements; i++) { - Value *Indices[2] = { - Zero, - ConstantInt::get(IdxType, i), - }; - auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), - Name + ".elt"); + auto *Ptr = IC.Builder.CreateInBoundsPtrAdd( + Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)), + Name + ".elt"); auto *L = IC.Builder.CreateAlignedLoad( ST->getElementType(i), Ptr, - commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack"); + commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()), + Name + ".unpack"); // Propagate AA metadata. It'll still be valid on the narrowed load. L->setAAMetadata(LI.getAAMetadata()); V = IC.Builder.CreateInsertValue(V, L, i); @@ -1222,10 +1215,6 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) { const DataLayout &DL = IC.getDataLayout(); auto *SL = DL.getStructLayout(ST); - // Don't unpack for structure with scalable vector. 
- if (SL->getSizeInBits().isScalable()) - return false; - if (SL->hasPadding()) return false; @@ -1237,17 +1226,14 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) { SmallString<16> AddrName = Addr->getName(); AddrName += ".repack"; - auto *IdxType = Type::getInt32Ty(ST->getContext()); - auto *Zero = ConstantInt::get(IdxType, 0); + auto *IdxType = DL.getIndexType(Addr->getType()); for (unsigned i = 0; i < Count; i++) { - Value *Indices[2] = { - Zero, - ConstantInt::get(IdxType, i), - }; - auto *Ptr = - IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), AddrName); + auto *Ptr = IC.Builder.CreateInBoundsPtrAdd( + Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)), + AddrName); auto *Val = IC.Builder.CreateExtractValue(V, i, EltName); - auto EltAlign = commonAlignment(Align, SL->getElementOffset(i)); + auto EltAlign = + commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()); llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign); NS->setAAMetadata(SI.getAAMetadata()); } diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll index e7d4a3b7d20444..a8790b579d75a5 100644 --- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll +++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll @@ -6,9 +6,11 @@ define @load(ptr %x) { ; CHECK-LABEL: define @load ; CHECK-SAME: (ptr [[X:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load [[STRUCT_TEST:%.*]], ptr [[X]], align 4 -; CHECK-NEXT: [[B:%.*]] = extractvalue [[STRUCT_TEST]] [[A]], 1 -; CHECK-NEXT: ret [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-NEXT: [[A_ELT1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] +; CHECK-NEXT: [[A_UNPACK2:%.*]] = load , ptr [[A_ELT1]], align 4 +; CHECK-NEXT: ret [[A_UNPACK2]] ; %a = load %struct.test, ptr %x %b = extractvalue %struct.test %a, 1 @@ -18,9 
+20,11 @@ define @load(ptr %x) { define void @store(ptr %x, %y, %z) { ; CHECK-LABEL: define void @store ; CHECK-SAME: (ptr [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = insertvalue [[STRUCT_TEST:%.*]] undef, [[Y]], 0 -; CHECK-NEXT: [[B:%.*]] = insertvalue [[STRUCT_TEST]] [[A]], [[Z]], 1 -; CHECK-NEXT: store [[STRUCT_TEST]] [[B]], ptr [[X]], align 4 +; CHECK-NEXT: store [[Y]], ptr [[X]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-NEXT: [[X_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] +; CHECK-NEXT: store [[Z]], ptr [[X_REPACK1]], align 4 ; CHECK-NEXT: ret void ; %a = insertvalue %struct.test undef, %y, 0 @@ -33,8 +37,14 @@ define {, } @split_load(ptr %p) nounwind { ; CHECK-LABEL: define { , } @split_load ; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 -; CHECK-NEXT: ret { , } [[R]] +; CHECK-NEXT: [[R_UNPACK:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[R_UNPACK]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4 +; CHECK-NEXT: [[R_ELT1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP2]] +; CHECK-NEXT: [[R_UNPACK2:%.*]] = load , ptr [[R_ELT1]], align 16 +; CHECK-NEXT: [[R3:%.*]] = insertvalue { , } [[TMP0]], [[R_UNPACK2]], 1 +; CHECK-NEXT: ret { , } [[R3]] ; entry: %r = load {, }, ptr %p @@ -58,7 +68,13 @@ define void @split_store({, } %x, ptr %p) no ; CHECK-LABEL: define void @split_store ; CHECK-SAME: ({ , } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 +; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { , } [[X]], 0 +; CHECK-NEXT: store [[X_ELT]], ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: 
[[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { , } [[X]], 1 +; CHECK-NEXT: store [[X_ELT2]], ptr [[P_REPACK1]], align 16 ; CHECK-NEXT: ret void ; entry: @@ -104,9 +120,21 @@ define {, } @check_nxv16i8_nxv4i32({, } @check_nxv16i8_nxv4i32 ; CHECK-SAME: ({ , } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 -; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 -; CHECK-NEXT: ret { , } [[R]] +; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { , } [[X]], 0 +; CHECK-NEXT: store [[X_ELT]], ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { , } [[X]], 1 +; CHECK-NEXT: store [[X_ELT2]], ptr [[P_REPACK1]], align 16 +; CHECK-NEXT: [[R_UNPACK:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } poison, [[R_UNPACK]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 4 +; CHECK-NEXT: [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]] +; CHECK-NEXT: [[R_UNPACK4:%.*]] = load , ptr [[R_ELT3]], align 16 +; CHECK-NEXT: [[R5:%.*]] = insertvalue { , } [[TMP2]], [[R_UNPACK4]], 1 +; CHECK-NEXT: ret { , } [[R5]] ; entry: store {, } %x, ptr %p @@ -119,9 +147,21 @@ define {, } @alloca_nxv16i8_nxv4i32({, } [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P:%.*]] = alloca { , }, align 16 -; CHECK-NEXT: store { , } [[X]], ptr [[P]], align 16 -; CHECK-NEXT: [[R:%.*]] = load { , }, ptr [[P]], align 16 -; CHECK-NEXT: ret { , } [[R]] +; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { , } [[X]], 0 +; CHECK-NEXT: store [[X_ELT]], ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 
[[TMP0]], 4 +; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { , } [[X]], 1 +; CHECK-NEXT: store [[X_ELT2]], ptr [[P_REPACK1]], align 16 +; CHECK-NEXT: [[R_UNPACK:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } poison, [[R_UNPACK]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 4 +; CHECK-NEXT: [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]] +; CHECK-NEXT: [[R_UNPACK4:%.*]] = load , ptr [[R_ELT3]], align 16 +; CHECK-NEXT: [[R5:%.*]] = insertvalue { , } [[TMP2]], [[R_UNPACK4]], 1 +; CHECK-NEXT: ret { , } [[R5]] ; entry: %p = alloca {, } From 3dec24d2a284e98c8a12b8ec9d0a80a657b2d992 Mon Sep 17 00:00:00 2001 From: mingmingl Date: Thu, 23 Jan 2025 10:11:11 -0800 Subject: [PATCH 192/208] Stats are sorted before they are printed. Try fixing test failure by checking stats in its print order. --- llvm/test/CodeGen/X86/jump-table-partition.ll | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index 6a6aa00fa7fa0a..2fd5d523a10cec 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -1,9 +1,6 @@ ; -stats requires asserts ; requires: asserts -; COM: Fix test failures on certain environments and re-enable the test. -; UNSUPPORTED: target={{.*}} - ; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of ; jump table entries so 'switch' needs fewer cases to generate a jump table. ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir @@ -13,9 +10,9 @@ ; COM: Update test to verify section suffixes when target-lowering and assembler changes are implemented. 
; COM: Also run static-data-splitter pass with -static-data-default-hotness=cold and check data section suffix. -; STAT-DAG: 2 static-data-splitter - Number of cold jump tables seen -; STAT-DAG: 2 static-data-splitter - Number of hot jump tables seen -; STAT-DAG: 1 static-data-splitter - Number of jump tables with unknown hotness +; STAT: 2 static-data-splitter - Number of cold jump tables seen +; STAT: 2 static-data-splitter - Number of hot jump tables seen +; STAT: 1 static-data-splitter - Number of jump tables with unknown hotness ; In function @foo, the 2 switch instructions to jt0.* and jt1.* get lowered to hot jump tables, ; and the 2 switch instructions to jt2.* and jt3.* get lowered to cold jump tables. From 6d5697f7cb4e933d2f176c46b7ac05a9cbaeb8b6 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Thu, 23 Jan 2025 19:11:18 +0100 Subject: [PATCH 193/208] [SystemZ] Fix ICE with i128->i64 uaddo carry chain We can only optimize a uaddo_carry via specialized instruction if the carry was produced by another uaddo(_carry) instruction; there is already a check for that. However, i128 uaddo(_carry) use a completely different mechanism; they indicate carry in a vector register instead of the CC flag. Thus, we must also check that we don't mix those two - that check has been missing. 
Fixes: https://github.com/llvm/llvm-project/issues/124001 --- .../Target/SystemZ/SystemZISelLowering.cpp | 12 ++++++---- llvm/test/CodeGen/SystemZ/pr124001.ll | 23 +++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/SystemZ/pr124001.ll diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 4040ab6d45103a..1fb31c26e20d3c 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -4708,15 +4708,19 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, } static bool isAddCarryChain(SDValue Carry) { - while (Carry.getOpcode() == ISD::UADDO_CARRY) + while (Carry.getOpcode() == ISD::UADDO_CARRY && + Carry->getValueType(0) != MVT::i128) Carry = Carry.getOperand(2); - return Carry.getOpcode() == ISD::UADDO; + return Carry.getOpcode() == ISD::UADDO && + Carry->getValueType(0) != MVT::i128; } static bool isSubBorrowChain(SDValue Carry) { - while (Carry.getOpcode() == ISD::USUBO_CARRY) + while (Carry.getOpcode() == ISD::USUBO_CARRY && + Carry->getValueType(0) != MVT::i128) Carry = Carry.getOperand(2); - return Carry.getOpcode() == ISD::USUBO; + return Carry.getOpcode() == ISD::USUBO && + Carry->getValueType(0) != MVT::i128; } // Lower UADDO_CARRY/USUBO_CARRY nodes. 
diff --git a/llvm/test/CodeGen/SystemZ/pr124001.ll b/llvm/test/CodeGen/SystemZ/pr124001.ll new file mode 100644 index 00000000000000..9cf630a55dd650 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/pr124001.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i64 @test(i128 %in) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vaccq %v0, %v0, %v1 +; CHECK-NEXT: vlgvg %r1, %v0, 1 +; CHECK-NEXT: la %r2, 1(%r1) +; CHECK-NEXT: br %r14 + %1 = tail call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %in, i128 1) + %2 = extractvalue { i128, i1 } %1, 1 + %3 = zext i1 %2 to i64 + %4 = add i64 %3, 1 + ret i64 %4 +} + +declare { i128, i1 } @llvm.uadd.with.overflow.i128(i128, i128) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 0d0190815d8f273e9d87c29b4779b81412b31e91 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 23 Jan 2025 18:24:20 +0000 Subject: [PATCH 194/208] [TailDup] Allow large number of predecessors/successors without phis. (#116072) This adjusts the threshold logic added in #78582 to only trigger for cases where there are actually phis to duplicate in either TailBB or in one of the successors. In cases there are no phis, we only have to pay the cost of extra edges, but have no explosion in PHI related instructions. This improves performance of Python on some inputs by 2-3% on Apple Silicon CPUs. 
PR: https://github.com/llvm/llvm-project/pull/116072 --- llvm/lib/CodeGen/TailDuplicator.cpp | 23 ++++--- .../CodeGen/X86/tail-dup-pred-succ-size.mir | 60 ++++++++++--------- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index f5346c8805733c..6c6d38462484a0 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -573,14 +573,6 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (TailBB.isSuccessor(&TailBB)) return false; - // Duplicating a BB which has both multiple predecessors and successors will - // result in a complex CFG and also may cause huge amount of PHI nodes. If we - // want to remove this limitation, we have to address - // https://github.com/llvm/llvm-project/issues/78578. - if (TailBB.pred_size() > TailDupPredSize && - TailBB.succ_size() > TailDupSuccSize) - return false; - // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. @@ -618,6 +610,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, // Check the instructions in the block to determine whether tail-duplication // is invalid or unlikely to be profitable. unsigned InstrCount = 0; + unsigned NumPhis = 0; for (MachineInstr &MI : TailBB) { // Non-duplicable things shouldn't be tail-duplicated. // CFI instructions are marked as non-duplicable, because Darwin compact @@ -661,6 +654,20 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (InstrCount > MaxDuplicateCount) return false; + NumPhis += MI.isPHI(); + } + + // Duplicating a BB which has both multiple predecessors and successors will + // may cause huge amount of PHI nodes. If we want to remove this limitation, + // we have to address https://github.com/llvm/llvm-project/issues/78578. 
+ if (TailBB.pred_size() > TailDupPredSize && + TailBB.succ_size() > TailDupSuccSize) { + // If TailBB or any of its successors contains a phi, we may have to add a + // large number of additional phis with additional incoming values. + if (NumPhis != 0 || any_of(TailBB.successors(), [](MachineBasicBlock *MBB) { + return any_of(*MBB, [](MachineInstr &MI) { return MI.isPHI(); }); + })) + return false; } // Check if any of the successors of TailBB has a PHI node in which the diff --git a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir index 1d17672e2c6bd0..2f1ff76fda76c9 100644 --- a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir +++ b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir @@ -538,43 +538,47 @@ body: | ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.2: - ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri1]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri1]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.3: - ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags - ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: 
[[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri2:%[0-9]+]]:gr32 = AND32ri [[SHR32ri3]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri2]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG2]], %jump-table.1, $noreg ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.4: - ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags - ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri3:%[0-9]+]]:gr32 = AND32ri [[SHR32ri5]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri3]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG3]], %jump-table.1, $noreg ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.5: - ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags - ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags + ; LIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri4:%[0-9]+]]:gr32 = AND32ri [[SHR32ri7]], 7, 
implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG4:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri4]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG4]], %jump-table.1, $noreg ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.6: ; LIMIT-NEXT: successors: ; LIMIT-NEXT: {{ $}} - ; LIMIT-NEXT: bb.7: - ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) - ; LIMIT-NEXT: {{ $}} - ; LIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags - ; LIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri4]], 7, implicit-def dead $eflags - ; LIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri1]], %subreg.sub_32bit - ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg - ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.9: ; LIMIT-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[MOV32rm4]] :: (store (s32)) @@ -583,23 +587,23 @@ body: | ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.10: ; LIMIT-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags - ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri5]] :: (store (s32)) - ; LIMIT-NEXT: $eax = COPY [[SHR32ri5]] + ; LIMIT-NEXT: [[SHR32ri8:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri8]] :: (store (s32)) + ; LIMIT-NEXT: $eax = COPY [[SHR32ri8]] ; LIMIT-NEXT: RET 0, $eax ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.11: ; LIMIT-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags - ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri6]] :: (store (s32)) - ; LIMIT-NEXT: $eax = COPY 
[[SHR32ri6]] + ; LIMIT-NEXT: [[SHR32ri9:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri9]] :: (store (s32)) + ; LIMIT-NEXT: $eax = COPY [[SHR32ri9]] ; LIMIT-NEXT: RET 0, $eax ; LIMIT-NEXT: {{ $}} ; LIMIT-NEXT: bb.12: ; LIMIT-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg - ; LIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags - ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri7]] :: (store (s32)) - ; LIMIT-NEXT: $eax = COPY [[SHR32ri7]] + ; LIMIT-NEXT: [[SHR32ri10:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags + ; LIMIT-NEXT: MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri10]] :: (store (s32)) + ; LIMIT-NEXT: $eax = COPY [[SHR32ri10]] ; LIMIT-NEXT: RET 0, $eax ; ; NOLIMIT-LABEL: name: foo_no_phis From 1c28b9237382b093f477479c993c80181922ca6a Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 24 Jan 2025 03:25:54 +0900 Subject: [PATCH 195/208] [Clang] __has_builtin should return false for aux triple builtins (#121839) Currently, `__has_builtin` will return true when passed a builtin that is only supported on the aux target. I found this when `__has_builtin` was called with an X86 builtin but the current target was SPIR-V. We should instead return false for aux builtins. 
--------- Signed-off-by: Sarnie, Nick --- clang/lib/Lex/PPMacroExpansion.cpp | 10 +++++++--- clang/test/Headers/__cpuidex_conflict.c | 4 +++- clang/test/Preprocessor/builtin_aux_info.cpp | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 clang/test/Preprocessor/builtin_aux_info.cpp diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index 347c13da0ad215..9cf29668f251fc 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -1804,8 +1804,9 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { diag::err_feature_check_malformed); if (!II) return false; - else if (II->getBuiltinID() != 0) { - switch (II->getBuiltinID()) { + auto BuiltinID = II->getBuiltinID(); + if (BuiltinID != 0) { + switch (BuiltinID) { case Builtin::BI__builtin_cpu_is: return getTargetInfo().supportsCpuIs(); case Builtin::BI__builtin_cpu_init: @@ -1818,8 +1819,11 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { // usual allocation and deallocation functions. Required by libc++ return 201802; default: + // __has_builtin should return false for aux builtins. 
+ if (getBuiltinInfo().isAuxBuiltinID(BuiltinID)) + return false; return Builtin::evaluateRequiredTargetFeatures( - getBuiltinInfo().getRequiredFeatures(II->getBuiltinID()), + getBuiltinInfo().getRequiredFeatures(BuiltinID), getTargetInfo().getTargetOpts().FeatureMap); } return true; diff --git a/clang/test/Headers/__cpuidex_conflict.c b/clang/test/Headers/__cpuidex_conflict.c index 8687a6aa2f897a..0f5e6e5e0a0ff4 100644 --- a/clang/test/Headers/__cpuidex_conflict.c +++ b/clang/test/Headers/__cpuidex_conflict.c @@ -3,7 +3,9 @@ // RUN: %clang_cc1 %s -ffreestanding -fms-extensions -fms-compatibility \ // RUN: -fms-compatibility-version=19.00 -triple x86_64-pc-windows-msvc -emit-llvm -o - // %clang_cc1 %s -ffreestanding -triple x86_64-w64-windows-gnu -fms-extensions -emit-llvm -o - -// RUN: %clang_cc1 %s -ffreestanding -fopenmp -fopenmp-is-target-device -aux-triple x86_64-unknown-linux-gnu +// +// FIXME: See https://github.com/llvm/llvm-project/pull/121839 +// RUN: not %clang_cc1 %s -ffreestanding -fopenmp -fopenmp-is-target-device -aux-triple x86_64-unknown-linux-gnu typedef __SIZE_TYPE__ size_t; diff --git a/clang/test/Preprocessor/builtin_aux_info.cpp b/clang/test/Preprocessor/builtin_aux_info.cpp new file mode 100644 index 00000000000000..60c8c6c492479a --- /dev/null +++ b/clang/test/Preprocessor/builtin_aux_info.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -fopenmp -triple=spirv64 -fopenmp-is-target-device \ +// RUN: -aux-triple x86_64-linux-unknown -E %s | FileCheck -implicit-check-not=BAD %s + +// RUN: %clang_cc1 -fopenmp -triple=nvptx64 -fopenmp-is-target-device \ +// RUN: -aux-triple x86_64-linux-unknown -E %s | FileCheck -implicit-check-not=BAD %s + +// RUN: %clang_cc1 -fopenmp -triple=amdgcn-amd-amdhsa -fopenmp-is-target-device \ +// RUN: -aux-triple x86_64-linux-unknown -E %s | FileCheck -implicit-check-not=BAD %s + +// RUN: %clang_cc1 -fopenmp -triple=aarch64 -fopenmp-is-target-device \ +// RUN: -aux-triple x86_64-linux-unknown -E %s | FileCheck 
-implicit-check-not=BAD %s + +// CHECK: GOOD +#if __has_builtin(__builtin_ia32_pause) + BAD +#else + GOOD +#endif From a6211a64dc22b11daa6f01122350a7287a593159 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 Jan 2025 18:28:02 +0000 Subject: [PATCH 196/208] [gn build] Port ff17a4136ded --- llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn index d74de409858b95..898e7470ec2eba 100644 --- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn @@ -114,10 +114,7 @@ static_library("Host") { ] } if (current_os == "android") { - sources += [ - "android/HostInfoAndroid.cpp", - "android/LibcGlue.cpp", - ] + sources += [ "android/HostInfoAndroid.cpp" ] } if (current_os == "freebsd") { From 4cf1fe240589d3f2a8a8332abf3f71a18bdba027 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 23 Jan 2025 10:37:11 -0800 Subject: [PATCH 197/208] [lldb] Add missing operations to GetOpcodeDataSize (#120163) The improved error reporting in #120162 revealed that we were missing opcodes in GetOpcodeDataSize. I changed the function to remove the default case and switch over the enum type which will cause the compiler to emit a warning if there are unhandled operations in the future. rdar://139705570 --- lldb/source/Expression/DWARFExpression.cpp | 76 ++++++++++++++++++---- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 1d826e341e2c44..f48f3ab9307dd1 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -132,10 +132,35 @@ static llvm::Error ReadRegisterValueAsScalar(RegisterContext *reg_ctx, /// are made on the state of \p data after this call. 
static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, const lldb::offset_t data_offset, - const uint8_t op, + const LocationAtom op, const DWARFUnit *dwarf_cu) { lldb::offset_t offset = data_offset; switch (op) { + // Only used in LLVM metadata. + case DW_OP_LLVM_fragment: + case DW_OP_LLVM_convert: + case DW_OP_LLVM_tag_offset: + case DW_OP_LLVM_entry_value: + case DW_OP_LLVM_implicit_pointer: + case DW_OP_LLVM_arg: + case DW_OP_LLVM_extract_bits_sext: + case DW_OP_LLVM_extract_bits_zext: + break; + // Vendor extensions: + case DW_OP_HP_is_value: + case DW_OP_HP_fltconst4: + case DW_OP_HP_fltconst8: + case DW_OP_HP_mod_range: + case DW_OP_HP_unmod_range: + case DW_OP_HP_tls: + case DW_OP_INTEL_bit_piece: + case DW_OP_WASM_location: + case DW_OP_WASM_location_int: + case DW_OP_APPLE_uninit: + case DW_OP_PGI_omp_thread_num: + case DW_OP_hi_user: + break; + case DW_OP_addr: case DW_OP_call_ref: // 0x9a 1 address sized offset of DIE (DWARF3) return data.GetAddressByteSize(); @@ -246,6 +271,7 @@ static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, case DW_OP_pick: // 0x15 1 1-byte stack index case DW_OP_deref_size: // 0x94 1 1-byte size of data retrieved case DW_OP_xderef_size: // 0x95 1 1-byte size of data retrieved + case DW_OP_deref_type: // 0xa6 1 1-byte constant return 1; // Opcodes with a single 2 byte arguments @@ -268,7 +294,6 @@ static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, return 8; // All opcodes that have a single ULEB (signed or unsigned) argument - case DW_OP_addrx: // 0xa1 1 ULEB128 index case DW_OP_constu: // 0x10 1 ULEB128 constant case DW_OP_consts: // 0x11 1 SLEB128 constant case DW_OP_plus_uconst: // 0x23 1 ULEB128 addend @@ -307,14 +332,20 @@ static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, case DW_OP_regx: // 0x90 1 ULEB128 register case DW_OP_fbreg: // 0x91 1 SLEB128 offset case DW_OP_piece: // 0x93 1 ULEB128 size of piece addressed + case DW_OP_convert: // 0xa8 1 ULEB128 offset + 
case DW_OP_reinterpret: // 0xa9 1 ULEB128 offset + case DW_OP_addrx: // 0xa1 1 ULEB128 index + case DW_OP_constx: // 0xa2 1 ULEB128 index + case DW_OP_xderef_type: // 0xa7 1 ULEB128 index case DW_OP_GNU_addr_index: // 0xfb 1 ULEB128 index case DW_OP_GNU_const_index: // 0xfc 1 ULEB128 index data.Skip_LEB128(&offset); return offset - data_offset; // All opcodes that have a 2 ULEB (signed or unsigned) arguments - case DW_OP_bregx: // 0x92 2 ULEB128 register followed by SLEB128 offset - case DW_OP_bit_piece: // 0x9d ULEB128 bit size, ULEB128 bit offset (DWARF3); + case DW_OP_bregx: // 0x92 2 ULEB128 register followed by SLEB128 offset + case DW_OP_bit_piece: // 0x9d ULEB128 bit size, ULEB128 bit offset (DWARF3); + case DW_OP_regval_type: // 0xa5 ULEB128 + ULEB128 data.Skip_LEB128(&offset); data.Skip_LEB128(&offset); return offset - data_offset; @@ -327,6 +358,13 @@ static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, return offset - data_offset; } + case DW_OP_implicit_pointer: // 0xa0 4-byte (or 8-byte for DWARF 64) constant + // + LEB128 + { + data.Skip_LEB128(&offset); + return DWARFUnit::GetAddressByteSize(dwarf_cu) + offset - data_offset; + } + case DW_OP_GNU_entry_value: case DW_OP_entry_value: // 0xa3 ULEB128 size + variable-length block { @@ -334,20 +372,32 @@ static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, return (offset - data_offset) + subexpr_len; } - default: - if (!dwarf_cu) { - return LLDB_INVALID_OFFSET; - } + case DW_OP_const_type: // 0xa4 ULEB128 + size + variable-length block + { + data.Skip_LEB128(&offset); + uint8_t length = data.GetU8(&offset); + return (offset - data_offset) + length; + } + + case DW_OP_LLVM_user: // 0xe9: ULEB128 + variable length constant + { + uint64_t constants = data.GetULEB128(&offset); + return (offset - data_offset) + constants; + } + } + + if (dwarf_cu) return dwarf_cu->GetSymbolFileDWARF().GetVendorDWARFOpcodeSize( data, data_offset, op); - } + + return LLDB_INVALID_OFFSET; } 
llvm::Expected DWARFExpression::GetLocation_DW_OP_addr(const DWARFUnit *dwarf_cu) const { lldb::offset_t offset = 0; while (m_data.ValidOffset(offset)) { - const uint8_t op = m_data.GetU8(&offset); + const LocationAtom op = static_cast(m_data.GetU8(&offset)); if (op == DW_OP_addr) return m_data.GetAddress(&offset); @@ -376,7 +426,7 @@ bool DWARFExpression::Update_DW_OP_addr(const DWARFUnit *dwarf_cu, lldb::addr_t file_addr) { lldb::offset_t offset = 0; while (m_data.ValidOffset(offset)) { - const uint8_t op = m_data.GetU8(&offset); + const LocationAtom op = static_cast(m_data.GetU8(&offset)); if (op == DW_OP_addr) { const uint32_t addr_byte_size = m_data.GetAddressByteSize(); @@ -434,7 +484,7 @@ bool DWARFExpression::ContainsThreadLocalStorage( const DWARFUnit *dwarf_cu) const { lldb::offset_t offset = 0; while (m_data.ValidOffset(offset)) { - const uint8_t op = m_data.GetU8(&offset); + const LocationAtom op = static_cast(m_data.GetU8(&offset)); if (op == DW_OP_form_tls_address || op == DW_OP_GNU_push_tls_address) return true; @@ -465,7 +515,7 @@ bool DWARFExpression::LinkThreadLocalStorage( lldb::addr_t const_value = 0; size_t const_byte_size = 0; while (m_data.ValidOffset(offset)) { - const uint8_t op = m_data.GetU8(&offset); + const LocationAtom op = static_cast(m_data.GetU8(&offset)); bool decoded_data = false; switch (op) { From 775d0f36f74851172f84074d90cde29e181b3edd Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 23 Jan 2025 18:43:50 +0000 Subject: [PATCH 198/208] [GVN] Handle scalable vectors with the same size in VNCoercion (#123984) This allows us to forward to a load even if the types do not match (nxv4i32 vs nxv2i64 for example). Scalable types are allowed in canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is the same, and some follow-on code is adjusted to make sure it handles scalable sizes correctly. 
Methods like analyzeLoadFromClobberingWrite and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as Offsets and mismatching types are not supported. --- llvm/lib/Transforms/Utils/VNCoercion.cpp | 30 ++++++++++++++++++------ llvm/test/Transforms/GVN/vscale.ll | 24 +++++++++---------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp index 1e0ae280516410..7a61ab74166389 100644 --- a/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, if (StoredTy == LoadTy) return true; + if (isa(StoredTy) && isa(LoadTy) && + DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy)) + return true; + // If the loaded/stored value is a first class array/struct, or scalable type, // don't try to transform them. We need to be able to bitcast to integer. if (isFirstClassAggregateOrScalableType(LoadTy) || @@ -83,8 +87,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedValue(); - uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedValue(); + TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy); + TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoredValSize == LoadedValSize) { @@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, // If the loaded value is smaller than the available value, then we can // extract out a piece from it. If the available value is too small, then we // can't do anything. 
- assert(StoredValSize >= LoadedValSize && + assert(!StoredValSize.isScalable() && + TypeSize::isKnownGE(StoredValSize, LoadedValSize) && "canCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. @@ -303,6 +308,13 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset, return SrcVal; } + // Return scalable values directly to avoid needing to bitcast to integer + // types, as we do not support non-zero Offsets. + if (isa(LoadTy)) { + assert(Offset == 0 && "Expected a zero offset for scalable types"); + return SrcVal; + } + uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8; uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8; @@ -333,11 +345,15 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset, Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, const DataLayout &DL) { - #ifndef NDEBUG - unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()).getFixedValue(); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedValue(); - assert(Offset + LoadSize <= SrcValSize); + TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType()); + TypeSize LoadSize = DL.getTypeStoreSize(LoadTy); + assert(SrcValSize.isScalable() == LoadSize.isScalable()); + assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) && + "Expected Offset + LoadSize <= SrcValSize"); + assert( + (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) && + "Expected scalable type sizes to match"); #endif IRBuilder<> Builder(InsertPt); SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL); diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll index f6e0f8c1a64944..67cbfc2f05ef84 100644 --- a/llvm/test/Transforms/GVN/vscale.ll +++ b/llvm/test/Transforms/GVN/vscale.ll @@ -393,7 +393,7 @@ if.else: define 
@load_v16i8_store_v4i32_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -404,7 +404,7 @@ define @load_v16i8_store_v4i32_forward_load(ptr %p, @load_v4f32_store_v4i32_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -415,7 +415,7 @@ define @load_v4f32_store_v4i32_forward_load(ptr %p, @load_v4f32_store_v16i8_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -426,7 +426,7 @@ define @load_v4f32_store_v16i8_forward_load(ptr %p, @load_v4i32_store_v4f32_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -496,7 +496,8 @@ define @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, < define @load_v2p0_store_v4i32_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; CHECK-NEXT: [[LOAD:%.*]] = inttoptr [[TMP1]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -507,7 +508,7 @@ define @load_v2p0_store_v4i32_forward_load(ptr %p, 
@load_v2i64_store_v2p0_forward_load(ptr %p, %x) { ; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( ; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = ptrtoint [[X]] to ; CHECK-NEXT: ret [[LOAD]] ; store %x, ptr %p @@ -540,8 +541,7 @@ define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, @load_v16i8_store_v4i32_forward_constant(ptr %p) { ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( ; CHECK-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 -; CHECK-NEXT: ret [[LOAD]] +; CHECK-NEXT: ret bitcast ( splat (i32 4) to ) ; store splat (i32 4), ptr %p %load = load , ptr %p @@ -590,13 +590,13 @@ define { , , , , , , } [[A]], 3 ; CHECK-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 -; CHECK-NEXT: [[DOTUNPACK:%.*]] = load , ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = bitcast [[A_ELT]] to ; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[DOTUNPACK]], 0 -; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load , ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[DOTUNPACK8:%.*]] = bitcast [[A_ELT2]] to ; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[DOTUNPACK8]], 1 -; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load , ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[DOTUNPACK10:%.*]] = bitcast [[A_ELT4]] to ; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[DOTUNPACK10]], 2 -; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load , ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[DOTUNPACK12:%.*]] = bitcast [[A_ELT6]] to ; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[DOTUNPACK12]], 3 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) ; CHECK-NEXT: ret { , , , } [[TMP15]] From c9b7303b9b18129c4ee6b56aaa2a0a9f59be2d09 Mon Sep 17 00:00:00 2001 From: Karlo Basioli <68535415+basioli-k@users.noreply.github.com> Date: Thu, 23 Jan 2025 
18:52:53 +0000 Subject: [PATCH 199/208] Add [[maybe_unused]] to a variable used only in assert in VPlan.h (#124173) --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 42b35e8b57c07d..16c64f32ab6349 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2461,7 +2461,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), Opcode(Opcode) { - auto *AccumulatorRecipe = getOperand(1)->getDefiningRecipe(); + [[maybe_unused]] auto *AccumulatorRecipe = + getOperand(1)->getDefiningRecipe(); assert((isa(AccumulatorRecipe) || isa(AccumulatorRecipe)) && "Unexpected operand order for partial reduction recipe"); From c3ecbe6792bbbda12b5a70273048643496b63484 Mon Sep 17 00:00:00 2001 From: mingmingl Date: Thu, 23 Jan 2025 11:07:08 -0800 Subject: [PATCH 200/208] Disable the test again. * https://lab.llvm.org/buildbot/#/builders/127/builds/2148/steps/7/logs/stdio shows a failure. --- llvm/test/CodeGen/X86/jump-table-partition.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index 2fd5d523a10cec..bf34c3bb2bf308 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -1,6 +1,9 @@ ; -stats requires asserts ; requires: asserts +; COM: Investigate test failure and re-enable the test. +; UNSUPPORTED: target={{.*}} + ; Stop after 'finalize-isel' for simpler MIR, and lower the minimum number of ; jump table entries so 'switch' needs fewer cases to generate a jump table. 
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -stop-after=finalize-isel -min-jump-table-entries=2 %s -o %t.mir From 4018317407006b2c632fbb75729de624a2426439 Mon Sep 17 00:00:00 2001 From: "Oleksandr T." Date: Thu, 23 Jan 2025 21:16:59 +0200 Subject: [PATCH 201/208] [Clang] restrict use of attribute names reserved by the C++ standard (#106036) Fixes #92196 https://eel.is/c++draft/macro.names#2 > A translation unit shall not #define or #undef names lexically identical to keywords, to the identifiers listed in Table [4](https://eel.is/c++draft/lex.name#tab:lex.name.special), or to the [attribute-token](https://eel.is/c++draft/dcl.attr.grammar#nt:attribute-token)s described in [[dcl.attr]](https://eel.is/c++draft/dcl.attr), except that the names likely and unlikely may be defined as function-like macros ([[cpp.replace]](https://eel.is/c++draft/cpp.replace))[.](https://eel.is/c++draft/macro.names#2.sentence-1) --- clang/docs/ReleaseNotes.rst | 2 + .../include/clang/Basic/AttributeCommonInfo.h | 9 +- clang/include/clang/Basic/Attributes.h | 5 + clang/include/clang/Basic/CMakeLists.txt | 11 ++ clang/include/clang/Basic/DiagnosticGroups.td | 4 +- .../include/clang/Basic/DiagnosticLexKinds.td | 3 + clang/include/clang/Lex/Preprocessor.h | 10 +- clang/include/clang/Sema/CMakeLists.txt | 5 - clang/lib/Basic/Attributes.cpp | 31 +++++- clang/lib/Lex/PPDirectives.cpp | 28 ++++- .../macro-reserved-attrs-cxx11.cpp | 100 ++++++++++++++++++ clang/utils/TableGen/ClangAttrEmitter.cpp | 30 ++++++ clang/utils/TableGen/TableGen.cpp | 6 ++ clang/utils/TableGen/TableGenBackends.h | 2 + .../gn/secondary/clang/lib/Basic/BUILD.gn | 5 +- .../gn/secondary/clang/lib/Sema/BUILD.gn | 2 +- .../llvm-project-overlay/clang/BUILD.bazel | 9 +- 17 files changed, 234 insertions(+), 28 deletions(-) create mode 100644 clang/test/Preprocessor/macro-reserved-attrs-cxx11.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5d4b182f29afa0..bd74abeb5dd68a 100644 --- 
a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -807,6 +807,8 @@ Improvements to Clang's diagnostics - Clang now emits a ``-Wignored-qualifiers`` diagnostic when a base class includes cv-qualifiers (#GH55474). +- Clang now diagnoses the use of attribute names reserved by the C++ standard (#GH92196). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h index 11c64547721739..4af5a8fd1852cf 100644 --- a/clang/include/clang/Basic/AttributeCommonInfo.h +++ b/clang/include/clang/Basic/AttributeCommonInfo.h @@ -61,13 +61,18 @@ class AttributeCommonInfo { }; enum Kind { #define PARSED_ATTR(NAME) AT_##NAME, -#include "clang/Sema/AttrParsedAttrList.inc" +#include "clang/Basic/AttrParsedAttrList.inc" #undef PARSED_ATTR NoSemaHandlerAttribute, IgnoredAttribute, UnknownAttribute, }; enum class Scope { NONE, CLANG, GNU, MSVC, OMP, HLSL, GSL, RISCV }; + enum class AttrArgsInfo { + None, + Optional, + Required, + }; private: const IdentifierInfo *AttrName = nullptr; @@ -241,6 +246,8 @@ class AttributeCommonInfo { static Kind getParsedKind(const IdentifierInfo *Name, const IdentifierInfo *Scope, Syntax SyntaxUsed); + static AttrArgsInfo getCXX11AttrArgsInfo(const IdentifierInfo *Name); + private: /// Get an index into the attribute spelling list /// defined in Attr.td. 
This index is used by an attribute diff --git a/clang/include/clang/Basic/Attributes.h b/clang/include/clang/Basic/Attributes.h index 61666a6f4d9ac4..99bb668fe32d00 100644 --- a/clang/include/clang/Basic/Attributes.h +++ b/clang/include/clang/Basic/Attributes.h @@ -23,6 +23,11 @@ int hasAttribute(AttributeCommonInfo::Syntax Syntax, const IdentifierInfo *Scope, const IdentifierInfo *Attr, const TargetInfo &Target, const LangOptions &LangOpts); +int hasAttribute(AttributeCommonInfo::Syntax Syntax, + const IdentifierInfo *Scope, const IdentifierInfo *Attr, + const TargetInfo &Target, const LangOptions &LangOpts, + bool CheckPlugins); + } // end namespace clang #endif // LLVM_CLANG_BASIC_ATTRIBUTES_H diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 56c27bacdb20b8..4103d2753abc5f 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -36,6 +36,11 @@ clang_tablegen(AttrList.inc -gen-clang-attr-list SOURCE Attr.td TARGET ClangAttrList) +clang_tablegen(AttrParsedAttrList.inc -gen-clang-attr-parsed-attr-list + -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ + SOURCE Attr.td + TARGET ClangAttrParsedAttrList) + clang_tablegen(AttrSubMatchRulesList.inc -gen-clang-attr-subject-match-rule-list -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE Attr.td @@ -53,6 +58,12 @@ clang_tablegen(AttrHasAttributeImpl.inc -gen-clang-attr-has-attribute-impl TARGET ClangAttrHasAttributeImpl ) +clang_tablegen(CXX11AttributeInfo.inc -gen-cxx11-attribute-info + -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ + SOURCE Attr.td + TARGET CXX11AttributeInfo + ) + clang_tablegen(Builtins.inc -gen-clang-builtins SOURCE Builtins.td TARGET ClangBuiltins) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index b0ad76026fdb35..209792f851b6ae 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -760,6 +760,7 @@ def 
AmbiguousMacro : DiagGroup<"ambiguous-macro">; def KeywordAsMacro : DiagGroup<"keyword-macro">; def ReservedIdAsMacro : DiagGroup<"reserved-macro-identifier">; def ReservedIdAsMacroAlias : DiagGroup<"reserved-id-macro", [ReservedIdAsMacro]>; +def ReservedAttributeIdentifier : DiagGroup<"reserved-attribute-identifier">; def RestrictExpansionMacro : DiagGroup<"restrict-expansion">; def FinalMacro : DiagGroup<"final-macro">; @@ -935,7 +936,8 @@ def SignedEnumBitfield : DiagGroup<"signed-enum-bitfield">; def ReservedModuleIdentifier : DiagGroup<"reserved-module-identifier">; def ReservedIdentifier : DiagGroup<"reserved-identifier", - [ReservedIdAsMacro, ReservedModuleIdentifier, UserDefinedLiterals]>; + [ReservedIdAsMacro, ReservedModuleIdentifier, + UserDefinedLiterals, ReservedAttributeIdentifier]>; // Unreachable code warning groups. // diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 959376b0847216..4bcef23ccce169 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -407,6 +407,9 @@ def warn_pp_macro_hides_keyword : Extension< def warn_pp_macro_is_reserved_id : Warning< "macro name is a reserved identifier">, DefaultIgnore, InGroup; +def warn_pp_macro_is_reserved_attribute_id : Warning< + "%0 is a reserved attribute identifier">, DefaultIgnore, + InGroup; def warn_pp_objc_macro_redef_ignored : Warning< "ignoring redefinition of Objective-C qualifier macro">, InGroup>; diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 3d223c345ea156..8ddc5b56eedbd4 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2271,6 +2271,11 @@ class Preprocessor { } } + /// Determine whether the next preprocessor token to be + /// lexed is a '('. 
If so, consume the token and return true, if not, this + /// method should have no observable side-effect on the lexed tokens. + bool isNextPPTokenLParen(); + private: /// Identifiers used for SEH handling in Borland. These are only /// allowed in particular circumstances @@ -2648,11 +2653,6 @@ class Preprocessor { void removeCachedMacroExpandedTokensOfLastLexer(); - /// Determine whether the next preprocessor token to be - /// lexed is a '('. If so, consume the token and return true, if not, this - /// method should have no observable side-effect on the lexed tokens. - bool isNextPPTokenLParen(); - /// After reading "MACRO(", this method is invoked to read all of the formal /// arguments specified for the macro invocation. Returns null on error. MacroArgs *ReadMacroCallArgumentList(Token &MacroName, MacroInfo *MI, diff --git a/clang/include/clang/Sema/CMakeLists.txt b/clang/include/clang/Sema/CMakeLists.txt index 0b0e31ece3195d..9077e22c2307cd 100644 --- a/clang/include/clang/Sema/CMakeLists.txt +++ b/clang/include/clang/Sema/CMakeLists.txt @@ -3,11 +3,6 @@ clang_tablegen(AttrTemplateInstantiate.inc -gen-clang-attr-template-instantiate SOURCE ../Basic/Attr.td TARGET ClangAttrTemplateInstantiate) -clang_tablegen(AttrParsedAttrList.inc -gen-clang-attr-parsed-attr-list - -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ - SOURCE ../Basic/Attr.td - TARGET ClangAttrParsedAttrList) - clang_tablegen(AttrParsedAttrKinds.inc -gen-clang-attr-parsed-attr-kinds -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE ../Basic/Attr.td diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp index fa26cc584b724a..2035d4c0a5768b 100644 --- a/clang/lib/Basic/Attributes.cpp +++ b/clang/lib/Basic/Attributes.cpp @@ -33,7 +33,8 @@ static int hasAttributeImpl(AttributeCommonInfo::Syntax Syntax, StringRef Name, int clang::hasAttribute(AttributeCommonInfo::Syntax Syntax, const IdentifierInfo *Scope, const IdentifierInfo *Attr, - const TargetInfo &Target, const LangOptions &LangOpts) { + 
const TargetInfo &Target, const LangOptions &LangOpts, + bool CheckPlugins) { StringRef Name = Attr->getName(); // Normalize the attribute name, __foo__ becomes foo. if (Name.size() >= 4 && Name.starts_with("__") && Name.ends_with("__")) @@ -61,14 +62,23 @@ int clang::hasAttribute(AttributeCommonInfo::Syntax Syntax, if (res) return res; - // Check if any plugin provides this attribute. - for (auto &Ptr : getAttributePluginInstances()) - if (Ptr->hasSpelling(Syntax, Name)) - return 1; + if (CheckPlugins) { + // Check if any plugin provides this attribute. + for (auto &Ptr : getAttributePluginInstances()) + if (Ptr->hasSpelling(Syntax, Name)) + return 1; + } return 0; } +int clang::hasAttribute(AttributeCommonInfo::Syntax Syntax, + const IdentifierInfo *Scope, const IdentifierInfo *Attr, + const TargetInfo &Target, const LangOptions &LangOpts) { + return hasAttribute(Syntax, Scope, Attr, Target, LangOpts, + /*CheckPlugins=*/true); +} + const char *attr::getSubjectMatchRuleSpelling(attr::SubjectMatchRule Rule) { switch (Rule) { #define ATTR_MATCH_RULE(NAME, SPELLING, IsAbstract) \ @@ -151,6 +161,17 @@ AttributeCommonInfo::getParsedKind(const IdentifierInfo *Name, return ::getAttrKind(normalizeName(Name, ScopeName, SyntaxUsed), SyntaxUsed); } +AttributeCommonInfo::AttrArgsInfo +AttributeCommonInfo::getCXX11AttrArgsInfo(const IdentifierInfo *Name) { + StringRef AttrName = + normalizeAttrName(Name, /*NormalizedScopeName*/ "", Syntax::AS_CXX11); +#define CXX11_ATTR_ARGS_INFO + return llvm::StringSwitch(AttrName) +#include "clang/Basic/CXX11AttributeInfo.inc" + .Default(AttributeCommonInfo::AttrArgsInfo::None); +#undef CXX11_ATTR_ARGS_INFO +} + std::string AttributeCommonInfo::getNormalizedFullName() const { return static_cast( normalizeName(getAttrName(), getScopeName(), getSyntax())); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index a23ad40884f249..a29b73f97ab7e3 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ 
b/clang/lib/Lex/PPDirectives.cpp @@ -11,6 +11,8 @@ /// //===----------------------------------------------------------------------===// +#include "clang/Basic/AttributeCommonInfo.h" +#include "clang/Basic/Attributes.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/DirectoryEntry.h" #include "clang/Basic/FileManager.h" @@ -97,7 +99,8 @@ SourceRange Preprocessor::DiscardUntilEndOfDirective(Token &Tmp) { enum MacroDiag { MD_NoWarn, //> Not a reserved identifier MD_KeywordDef, //> Macro hides keyword, enabled by default - MD_ReservedMacro //> #define of #undef reserved id, disabled by default + MD_ReservedMacro, //> #define of #undef reserved id, disabled by default + MD_ReservedAttributeIdentifier }; /// Enumerates possible %select values for the pp_err_elif_after_else and @@ -173,6 +176,22 @@ static bool isLanguageDefinedBuiltin(const SourceManager &SourceMgr, return false; } +static bool isReservedCXXAttributeName(Preprocessor &PP, IdentifierInfo *II) { + const LangOptions &Lang = PP.getLangOpts(); + if (Lang.CPlusPlus && + hasAttribute(AttributeCommonInfo::AS_CXX11, /* Scope*/ nullptr, II, + PP.getTargetInfo(), Lang, /*CheckPlugins*/ false) > 0) { + AttributeCommonInfo::AttrArgsInfo AttrArgsInfo = + AttributeCommonInfo::getCXX11AttrArgsInfo(II); + if (AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Required) + return PP.isNextPPTokenLParen(); + + return !PP.isNextPPTokenLParen() || + AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Optional; + } + return false; +} + static MacroDiag shouldWarnOnMacroDef(Preprocessor &PP, IdentifierInfo *II) { const LangOptions &Lang = PP.getLangOpts(); StringRef Text = II->getName(); @@ -182,6 +201,8 @@ static MacroDiag shouldWarnOnMacroDef(Preprocessor &PP, IdentifierInfo *II) { return MD_KeywordDef; if (Lang.CPlusPlus11 && (Text == "override" || Text == "final")) return MD_KeywordDef; + if (isReservedCXXAttributeName(PP, II)) + return MD_ReservedAttributeIdentifier; return MD_NoWarn; } @@ -190,6 +211,8 @@ static 
MacroDiag shouldWarnOnMacroUndef(Preprocessor &PP, IdentifierInfo *II) { // Do not warn on keyword undef. It is generally harmless and widely used. if (isReservedInAllContexts(II->isReserved(Lang))) return MD_ReservedMacro; + if (isReservedCXXAttributeName(PP, II)) + return MD_ReservedAttributeIdentifier; return MD_NoWarn; } @@ -365,6 +388,9 @@ bool Preprocessor::CheckMacroName(Token &MacroNameTok, MacroUse isDefineUndef, } if (D == MD_ReservedMacro) Diag(MacroNameTok, diag::warn_pp_macro_is_reserved_id); + if (D == MD_ReservedAttributeIdentifier) + Diag(MacroNameTok, diag::warn_pp_macro_is_reserved_attribute_id) + << II->getName(); } // Okay, we got a good identifier. diff --git a/clang/test/Preprocessor/macro-reserved-attrs-cxx11.cpp b/clang/test/Preprocessor/macro-reserved-attrs-cxx11.cpp new file mode 100644 index 00000000000000..ab48f1b46df989 --- /dev/null +++ b/clang/test/Preprocessor/macro-reserved-attrs-cxx11.cpp @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -Wreserved-attribute-identifier -fsyntax-only -verify %s -DTEST1 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -Wreserved-attribute-identifier -fsyntax-only -verify %s -DTEST2 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -Wreserved-attribute-identifier -fsyntax-only -verify %s -DTEST3 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -Wreserved-attribute-identifier -fsyntax-only -verify %s -DTEST4 + +#ifdef TEST1 + +#define assume +#undef assume + +#define noreturn // expected-warning {{noreturn is a reserved attribute identifier}} +#undef noreturn // expected-warning {{noreturn is a reserved attribute identifier}} + +#define carries_dependency // expected-warning {{carries_dependency is a reserved attribute identifier}} +#undef carries_dependency // expected-warning {{carries_dependency is a reserved attribute identifier}} + +#define deprecated // expected-warning {{deprecated is a reserved attribute identifier}} +#undef deprecated // 
expected-warning {{deprecated is a reserved attribute identifier}} + +#define fallthrough // expected-warning {{fallthrough is a reserved attribute identifier}} +#undef fallthrough // expected-warning {{fallthrough is a reserved attribute identifier}} + +#define likely // expected-warning {{likely is a reserved attribute identifier}} +#undef likely // expected-warning {{likely is a reserved attribute identifier}} + +#define no_unique_address // expected-warning {{no_unique_address is a reserved attribute identifier}} +#undef no_unique_address // expected-warning {{no_unique_address is a reserved attribute identifier}} + +#define unlikely // expected-warning {{unlikely is a reserved attribute identifier}} +#undef unlikely // expected-warning {{unlikely is a reserved attribute identifier}} + +#define maybe_unused // expected-warning {{maybe_unused is a reserved attribute identifier}} +#undef maybe_unused // expected-warning {{maybe_unused is a reserved attribute identifier}} + +#define nodiscard // expected-warning {{nodiscard is a reserved attribute identifier}} +#undef nodiscard // expected-warning {{nodiscard is a reserved attribute identifier}} + +#elif TEST2 + +#define assume "test" +#undef assume + +#define noreturn "test" // expected-warning {{noreturn is a reserved attribute identifier}} +#undef noreturn // expected-warning {{noreturn is a reserved attribute identifier}} + +#define carries_dependency "test" // expected-warning {{carries_dependency is a reserved attribute identifier}} +#undef carries_dependency // expected-warning {{carries_dependency is a reserved attribute identifier}} + +#define deprecated "test" // expected-warning {{deprecated is a reserved attribute identifier}} +#undef deprecated // expected-warning {{deprecated is a reserved attribute identifier}} + +#define fallthrough "test" // expected-warning {{fallthrough is a reserved attribute identifier}} +#undef fallthrough // expected-warning {{fallthrough is a reserved attribute identifier}} 
+ +#define likely "test" // expected-warning {{likely is a reserved attribute identifier}} +#undef likely // expected-warning {{likely is a reserved attribute identifier}} + +#define no_unique_address "test" // expected-warning {{no_unique_address is a reserved attribute identifier}} +#undef no_unique_address // expected-warning {{no_unique_address is a reserved attribute identifier}} + +#define unlikely "test" // expected-warning {{unlikely is a reserved attribute identifier}} +#undef unlikely // expected-warning {{unlikely is a reserved attribute identifier}} + +#define maybe_unused "test" // expected-warning {{maybe_unused is a reserved attribute identifier}} +#undef maybe_unused // expected-warning {{maybe_unused is a reserved attribute identifier}} + +#define nodiscard "test" // expected-warning {{nodiscard is a reserved attribute identifier}} +#undef nodiscard // expected-warning {{nodiscard is a reserved attribute identifier}} + +#elif TEST3 + +#define assume() "test" // expected-warning {{assume is a reserved attribute identifier}} +#define deprecated() "test" // expected-warning {{deprecated is a reserved attribute identifier}} +#define nodiscard() "test" // expected-warning {{nodiscard is a reserved attribute identifier}} +#define noreturn() "test" +#define carries_dependency() "test" +#define fallthrough() "test" +#define likely() "test" +#define no_unique_address() "test" +#define unlikely() "test" +#define maybe_unused() "test" + +#elif TEST4 + +#define assume() // expected-warning {{assume is a reserved attribute identifier}} +#define deprecated() // expected-warning {{deprecated is a reserved attribute identifier}} +#define nodiscard() // expected-warning {{nodiscard is a reserved attribute identifier}} +#define noreturn() +#define carries_dependency() +#define fallthrough() +#define likely() +#define no_unique_address() +#define unlikely() +#define maybe_unused() + +#else + +#error Unknown test + +#endif diff --git 
a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44ec..de12c7062666a4 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3743,6 +3743,36 @@ void EmitClangRegularKeywordAttributeInfo(const RecordKeeper &Records, OS << "#undef KEYWORD_ATTRIBUTE\n"; } +void EmitCXX11AttributeInfo(const RecordKeeper &Records, raw_ostream &OS) { + OS << "#if defined(CXX11_ATTR_ARGS_INFO)\n"; + for (auto *R : Records.getAllDerivedDefinitions("Attr")) { + for (const FlattenedSpelling &SI : GetFlattenedSpellings(*R)) { + if (SI.variety() == "CXX11" && SI.nameSpace().empty()) { + unsigned RequiredArgs = 0; + unsigned OptionalArgs = 0; + for (const auto *Arg : R->getValueAsListOfDefs("Args")) { + if (Arg->getValueAsBit("Fake")) + continue; + + if (Arg->getValueAsBit("Optional")) + OptionalArgs++; + else + RequiredArgs++; + } + OS << ".Case(\"" << SI.getSpellingRecord().getValueAsString("Name") + << "\"," + << "AttributeCommonInfo::AttrArgsInfo::" + << (RequiredArgs ? "Required" + : OptionalArgs ? "Optional" + : "None") + << ")" + << "\n"; + } + } + } + OS << "#endif // CXX11_ATTR_ARGS_INFO\n"; +} + // Emits the list of spellings for attributes. 
void EmitClangAttrHasAttrImpl(const RecordKeeper &Records, raw_ostream &OS) { emitSourceFileHeader("Code to implement the __has_attribute logic", OS, diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 8b8eadbe7f7e54..569d7a6a3ac8b7 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -69,6 +69,7 @@ enum ActionType { GenClangOpenCLBuiltins, GenClangOpenCLBuiltinHeader, GenClangOpenCLBuiltinTests, + GenCXX11AttributeInfo, GenArmNeon, GenArmFP16, GenArmBF16, @@ -228,6 +229,8 @@ cl::opt Action( "Generate OpenCL builtin header"), clEnumValN(GenClangOpenCLBuiltinTests, "gen-clang-opencl-builtin-tests", "Generate OpenCL builtin declaration tests"), + clEnumValN(GenCXX11AttributeInfo, "gen-cxx11-attribute-info", + "Generate CXX11 attributes info"), clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"), clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"), clEnumValN(GenArmBF16, "gen-arm-bf16", "Generate arm_bf16.h for clang"), @@ -336,6 +339,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) { case GenClangAttrSubjectMatchRulesParserStringSwitches: EmitClangAttrSubjectMatchRulesParserStringSwitches(Records, OS); break; + case GenCXX11AttributeInfo: + EmitCXX11AttributeInfo(Records, OS); + break; case GenClangAttrImpl: EmitClangAttrImpl(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 0448c94de08e3d..03ed3dad93631b 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -49,6 +49,8 @@ void EmitClangAttrParserStringSwitches(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrSubjectMatchRulesParserStringSwitches( const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCXX11AttributeInfo(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrClass(const 
llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrImpl(const llvm::RecordKeeper &Records, diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index d759ff4429a922..99c86facde83c6 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -22,6 +22,7 @@ static_library("Basic") { public_deps = [ # public_dep because public header Version.h includes generated Version.inc. "//clang/include/clang/Basic:AttrList", + "//clang/include/clang/Basic:AttrParsedAttrList", "//clang/include/clang/Basic:AttrSubMatchRulesList", "//clang/include/clang/Basic:Builtins", "//clang/include/clang/Basic:BuiltinsBPF", @@ -42,10 +43,6 @@ static_library("Basic") { "//clang/include/clang/Basic:riscv_vector_builtins", "//clang/include/clang/Basic:version", - # public_dep because public header AttributeCommonInfo.h includes generated - # AttrParsedAttrList.inc. - "//clang/include/clang/Sema:AttrParsedAttrList", - # public_dep because public header OpenMPKinds.h includes generated # OMP.h.inc. 
"//llvm/include/llvm/Frontend/OpenMP:public_tablegen", diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 3b72177ee5d7c3..306ef0adda7088 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -9,6 +9,7 @@ static_library("Sema") { configs += [ "//llvm/utils/gn/build:clang_code" ] deps = [ ":OpenCLBuiltins", + "//clang/include/clang/Basic:AttrParsedAttrList", "//clang/include/clang/Basic:arm_cde_builtin_aliases", "//clang/include/clang/Basic:arm_cde_builtin_sema", "//clang/include/clang/Basic:arm_mve_builtin_aliases", @@ -22,7 +23,6 @@ static_library("Sema") { "//clang/include/clang/Basic:riscv_vector_builtin_sema", "//clang/include/clang/Sema:AttrParsedAttrImpl", "//clang/include/clang/Sema:AttrParsedAttrKinds", - "//clang/include/clang/Sema:AttrParsedAttrList", "//clang/include/clang/Sema:AttrSpellingListIndex", "//clang/include/clang/Sema:AttrTemplateInstantiate", "//clang/lib/APINotes", diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index e3f4fab2c3fdb9..cd452cb6898db5 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -572,6 +572,10 @@ gentbl( "-gen-clang-attr-list", "include/clang/Basic/AttrList.inc", ), + ( + "-gen-clang-attr-parsed-attr-list", + "include/clang/Basic/AttrParsedAttrList.inc", + ), ( "-gen-clang-attr-subject-match-rule-list", "include/clang/Basic/AttrSubMatchRulesList.inc", @@ -1135,10 +1139,6 @@ gentbl( "-gen-clang-attr-parsed-attr-kinds", "include/clang/Sema/AttrParsedAttrKinds.inc", ), - ( - "-gen-clang-attr-parsed-attr-list", - "include/clang/Sema/AttrParsedAttrList.inc", - ), ( "-gen-clang-attr-spelling-index", "include/clang/Sema/AttrSpellingListIndex.inc", @@ -1174,7 +1174,6 @@ cc_library( textual_hdrs = [ "include/clang/Sema/AttrParsedAttrImpl.inc", 
"include/clang/Sema/AttrParsedAttrKinds.inc", - "include/clang/Sema/AttrParsedAttrList.inc", "include/clang/Sema/AttrSpellingListIndex.inc", "include/clang/Sema/AttrTemplateInstantiate.inc", "lib/Sema/OpenCLBuiltins.inc", From d17e4ca7e02b6c4118df9170d10c4e68aee7da0a Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 23 Jan 2025 13:23:51 -0600 Subject: [PATCH 202/208] [bazel][NFC] Add td_library for downstream use (#124156) This will allow td_library/gentbl_cc_library in other packages to use these td files. --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 56dff6b3ad5003..a54d464ac81d3a 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -1267,6 +1267,12 @@ filegroup( ]), ) +td_library( + name = "CommonTargetTdFiles", + srcs = [":common_target_td_sources"], + includes = ["include"], +) + gentbl( name = "ARMTargetParserDefGen", tbl_outs = [("-gen-arm-target-def", "include/llvm/TargetParser/ARMTargetParserDef.inc")], From cb981cc540ba7e16f973e925a80b5bcb337381cc Mon Sep 17 00:00:00 2001 From: Alex Prabhat Bara Date: Fri, 24 Jan 2025 00:55:14 +0530 Subject: [PATCH 203/208] [libc] added btowc to wchar.h generated header (#124168) Fixes: #124152 --- libc/include/wchar.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 27a5926b574554..159237c7b4ff7c 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -15,3 +15,9 @@ functions: return_type: int arguments: - type: wint_t + - name: btowc + standards: + - stdc + return_type: wint_t + arguments: + - type: int From c118864223c6309378cd704f3406533474c2759f Mon Sep 17 00:00:00 2001 From: Yi Qian <68618497+yiqian1@users.noreply.github.com> Date: Thu, 23 Jan 2025 13:27:56 -0600 Subject: [PATCH 204/208] 
[MLIR][ROCDL]Add MFMA_*_F8F6F4 instructions to the ROCDL dialect (#123830) This PR adds mfma.scale.f32.32x32x64.f8f6f4 and mfma.scale.f32.16x16x128.f8f6f4 to the ROCDL dialect. They are converted to the corresponding intrinsics in the mlir-to-llvmir pass. --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 15 +- mlir/test/Dialect/LLVMIR/rocdl.mlir | 309 ++++++++++++++++++- mlir/test/Target/LLVMIR/rocdl.mlir | 276 +++++++++++++++++ 3 files changed, 598 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index e9e62a74237c4f..95fbe7ed66a434 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -343,6 +343,18 @@ class ROCDL_Mfma_IntrOp traits = []> : "$args attr-dict `:` functional-type($args, $res)"; } +//===---------------------------------------------------------------------===// +// MFMA intrinsics with overloaded operands +class ROCDL_Mfma_OO_IntrOp overloadedOperands, + list traits = []> : + LLVM_IntrOpBase, + Arguments<(ins Variadic:$args)> { + let assemblyFormat = + "$args attr-dict `:` functional-type($args, $res)"; +} + // Available on all CDNA. 
def ROCDL_mfma_f32_32x32x1f32 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x1f32">; def ROCDL_mfma_f32_16x16x1f32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x1f32">; @@ -394,7 +406,8 @@ def ROCDL_mfma_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x32.f16">; def ROCDL_mfma_f32_32x32x16_bf16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.bf16">; def ROCDL_mfma_i32_32x32x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x32.i8">; def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; - +def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; +def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; //===---------------------------------------------------------------------===// // WMMA intrinsics class ROCDL_Wmma_IntrOp overloadedOperands, diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index c80ebebaafe3ad..712f8c2a1caf66 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -66,7 +66,8 @@ func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, %arg8 : vector<16xi32>, %arg9 : vector<4xi32>, %arg10 : vector<2xi16>, %arg11 : vector<4xi16>, %arg12 : vector<4xf64>, %arg13 : f64, - %arg14 : i64, %arg15 : vector<2xf32>) { + %arg14 : i64, %arg15 : vector<2xf32>, + %arg16: vector<8xbf16>, %arg17 : vector<8xf16>) { // CHECK-LABEL: rocdl.xdlops // CHECK: rocdl.mfma.f32.32x32x1f32 {{.*}} : (f32, f32, vector<32xf32>, i32, i32, i32) -> vector<32xf32> %r0 = rocdl.mfma.f32.32x32x1f32 %arg0, %arg1, %arg2, %arg3, %arg3, %arg3 : @@ -224,6 +225,312 @@ func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, (vector<2xf32>, vector<2xf32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + // CHECK: rocdl.mfma.f32.16x16x32.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32> + %r31 = rocdl.mfma.f32.16x16x32.bf16 %arg16, %arg16, %arg5, %arg3, %arg3, %arg3 : + (vector<8xbf16>, vector<8xbf16>, 
vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: rocdl.mfma.i32.16x16x64.i8 {{.*}} : (vector<4xi32>, vector<4xi32>, vector<4xi32>, i32, i32, i32) -> vector<4xi32> + %r32 = rocdl.mfma.i32.16x16x64.i8 %arg9, %arg9, %arg9, %arg3, %arg3, %arg3 : + (vector<4xi32>, vector<4xi32>, vector<4xi32>, + i32, i32, i32) -> vector<4xi32> + + // CHECK: rocdl.mfma.f32.16x16x32.f16 {{.*}} : (vector<8xf16>, vector<8xf16>, vector<4xf32>, i32, i32, i32) -> vector<4xi32> + %r33 = rocdl.mfma.f32.16x16x32.f16 %arg17, %arg17, %arg5, %arg3, %arg3, %arg3 : + (vector<8xf16>, vector<8xf16>, vector<4xf32>, + i32, i32, i32) -> vector<4xi32> + + // CHECK: rocdl.mfma.f32.32x32x16.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r34 = rocdl.mfma.f32.32x32x16.bf16 %arg16, %arg16, %arg4, %arg3, %arg3, %arg3 : + (vector<8xbf16>, vector<8xbf16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.mfma.i32.32x32x32.i8 {{.*}} : (vector<4xi32>, vector<4xi32>, vector<16xi32>, i32, i32, i32) -> vector<16xi32> + %r35 = rocdl.mfma.i32.32x32x32.i8 %arg9, %arg9, %arg8, %arg3, %arg3, %arg3 : + (vector<4xi32>, vector<4xi32>, vector<16xi32>, + i32, i32, i32) -> vector<16xi32> + + // CHECK: rocdl.mfma.f32.32x32x16.f16 {{.*}} : (vector<8xf16>, vector<8xf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r36 = rocdl.mfma.f32.32x32x16.f16 %arg17, %arg17, %arg4, %arg3, %arg3, %arg3 : + (vector<8xf16>, vector<8xf16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + llvm.return +} + +llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, + %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, + %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) { + %cst0 = llvm.mlir.constant(0 : i32) : i32 + %cst1 = llvm.mlir.constant(1 : i32) : i32 + %cst2 = llvm.mlir.constant(2 : i32) : i32 + %cst3 = llvm.mlir.constant(3 : i32) : i32 + %cst4 = llvm.mlir.constant(4 : i32) : i32 + + // CHECK-LABEL: rocdl.mfma.scale.f32.32x32x64.f8f6f4 + // fp8 * fp8 
+ // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r00 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * bf8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r01 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * fp6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r02 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * bf6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r03 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * fp4 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r04 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, 
i32, i32, i32, i32) -> vector<16xf32> + %r10 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * bf8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r11 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r12 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * bf6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r13 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp4 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r14 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * fp8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r20 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, 
%cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * bf8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r21 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * fp6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r22 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * bf6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r23 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * fp4 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r24 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r30 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) 
-> vector<16xf32> + + // bf6 * bf8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r31 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r32 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * bf6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r33 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp4 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r34 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r40 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * bf8 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<4xi32>, 
vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r41 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r42 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * bf6 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r43 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp4 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4 {{.*}} : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + %r44 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + llvm.return +} + +llvm.func @rocdl.mfma.scale.f32.16x16x128.f8f6f4(%arg0 : i32, + %arg1 : vector<4 x f32>, %arg2 : vector<8xi32>, + %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) { + %cst0 = llvm.mlir.constant(0 : i32) : i32 + %cst1 = llvm.mlir.constant(1 : i32) : i32 + %cst2 = llvm.mlir.constant(2 : i32) : i32 + %cst3 = llvm.mlir.constant(3 : i32) : i32 + %cst4 = llvm.mlir.constant(4 : i32) : i32 + + // CHECK-LABEL: rocdl.mfma.scale.f32.16x16x128.f8f6f4 + // fp8 * fp8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<4xf32>, 
i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r00 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * bf8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r01 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * fp6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r02 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * bf6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r03 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * fp4 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r04 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r10 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst1, 
%cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * bf8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r11 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r12 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * bf6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r13 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp4 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r14 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r20 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> 
vector<4xf32> + + // fp6 * bf8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r21 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r22 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * bf6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r23 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp4 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r24 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r30 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * bf8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<8xi32>, 
vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r31 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r32 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * bf6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r33 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp4 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r34 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r40 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * bf8 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r41 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, 
%arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r42 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * bf6 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r43 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp4 + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4 {{.*}} : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + %r44 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + llvm.return } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8879ba02b24057..b74edb62106837 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -398,6 +398,282 @@ llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, llvm.return %r0 : vector<32 x f32> } +llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, + %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, + %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<16 x f32> { + %cst0 = llvm.mlir.constant(0 : i32) : i32 + %cst1 = llvm.mlir.constant(1 : i32) : i32 + %cst2 = llvm.mlir.constant(2 : i32) : i32 + %cst3 = llvm.mlir.constant(3 : i32) : i32 + %cst4 = llvm.mlir.constant(4 : 
i32) : i32 + + // CHECK-LABEL: rocdl.mfma.scale.f32.32x32x64.f8f6f4 + // fp8 * fp8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r00 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * bf8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r01 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * fp6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r02 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * bf6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r03 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp8 * fp4 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r04 = 
rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r10 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * bf8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r11 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r12 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * bf6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r13 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf8 * fp4 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 
x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r14 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * fp8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r20 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * bf8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r21 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * fp6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r22 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp6 * bf6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r23 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> 
vector<16xf32> + + // fp6 * fp4 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r24 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r30 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * bf8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r31 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r32 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * bf6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r33 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, 
%cst3, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // bf6 * fp4 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r34 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r40 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * bf8 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r41 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r42 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * bf6 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, 
i32 4, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r43 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + // fp4 * fp4 + // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r44 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + + llvm.return %r00 : vector<16 x f32> +} + +llvm.func @rocdl.mfma.scale.f32.16x16x128.f8f6f4(%arg0 : i32, + %arg1 : vector<4 x f32>, %arg2 : vector<8xi32>, + %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<4 x f32> { + %cst0 = llvm.mlir.constant(0 : i32) : i32 + %cst1 = llvm.mlir.constant(1 : i32) : i32 + %cst2 = llvm.mlir.constant(2 : i32) : i32 + %cst3 = llvm.mlir.constant(3 : i32) : i32 + %cst4 = llvm.mlir.constant(4 : i32) : i32 + + // CHECK-LABEL: rocdl.mfma.scale.f32.16x16x128.f8f6f4 + // fp8 * fp8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r00 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * bf8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r01 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, 
i32, i32, i32) -> vector<4xf32> + + // fp8 * fp6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r02 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * bf6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r03 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp8 * fp4 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r04 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r10 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * bf8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r11 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, 
%arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r12 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * bf6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r13 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf8 * fp4 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r14 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r20 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * bf8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 
2, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r21 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r22 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * bf6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r23 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp6 * fp4 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r24 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r30 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * bf8 + // CHECK: call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r31 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r32 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * bf6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r33 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // bf6 * fp4 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r34 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r40 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, 
vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * bf8 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r41 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r42 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * bf6 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) + %r43 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + // fp4 * fp4 + // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}} + %r44 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 : + (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + + llvm.return %r00 : vector<4 x f32> +} + llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : vector<16 x i16>, %arg3 : vector<8 x i32>, %arg4 : vector<2xi32>, %arg5 : vector<4xi32>, %arg6 : vector<4xf32>, 
%arg7 : vector<8xf16>, %arg8 : vector<8xi16>) -> vector<8xf32> { %zero = llvm.mlir.constant(false) : i1 From 24b137365004f0916f9ed64bc5859d8b80585ca1 Mon Sep 17 00:00:00 2001 From: Prashanth Date: Fri, 24 Jan 2025 01:00:01 +0530 Subject: [PATCH 205/208] [libc][docs] Add Unistd header's page to the status of implementations doc (#123068) These changes ensure that the unistd header is documented properly with respect to the issue ( https://github.com/llvm/llvm-project/issues/122006 ) . --- libc/docs/CMakeLists.txt | 1 + libc/docs/headers/index.rst | 1 + libc/utils/docgen/unistd.yaml | 685 ++++++++++++++++++++++++++++++++++ 3 files changed, 687 insertions(+) create mode 100644 libc/utils/docgen/unistd.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index 04eaa7f1b8a5dc..bb8e3e96e47cac 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -62,6 +62,7 @@ if (SPHINX_FOUND) termios threads uchar + unistd wchar wctype ) diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 4a66d68ed902d8..d08552d2232522 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -34,6 +34,7 @@ Implementation Status threads time uchar + unistd wchar wctype .. 
diff --git a/libc/utils/docgen/unistd.yaml b/libc/utils/docgen/unistd.yaml new file mode 100644 index 00000000000000..5cdb1351322e02 --- /dev/null +++ b/libc/utils/docgen/unistd.yaml @@ -0,0 +1,685 @@ +macros: + _POSIX_VERSION: + in-latest-posix: "" + _POSIX2_VERSION: + in-latest-posix: "" + _POSIX_SUBPROFILE: + in-latest-posix: "" + _XOPEN_VERSION: + in-latest-posix: "" + + _POSIX_ADVISORY_INFO: + in-latest-posix: "" + _POSIX_ASYNCHRONOUS_IO: + in-latest-posix: "" + _POSIX_BARRIERS: + in-latest-posix: "" + _POSIX_CHOWN_RESTRICTED: + in-latest-posix: "" + _POSIX_CLOCK_SELECTION: + in-latest-posix: "" + _POSIX_CPUTIME: + in-latest-posix: "" + _POSIX_DEVICE_CONTROL: + in-latest-posix: "" + _POSIX_FSYNC: + in-latest-posix: "" + _POSIX_IPV6: + in-latest-posix: "" + _POSIX_JOB_CONTROL: + in-latest-posix: "" + _POSIX_MAPPED_FILES: + in-latest-posix: "" + _POSIX_MEMLOCK: + in-latest-posix: "" + _POSIX_MEMLOCK_RANGE: + in-latest-posix: "" + _POSIX_MEMORY_PROTECTION: + in-latest-posix: "" + _POSIX_MESSAGE_PASSING: + in-latest-posix: "" + _POSIX_MONOTONIC_CLOCK: + in-latest-posix: "" + _POSIX_NO_TRUNC: + in-latest-posix: "" + _POSIX_PRIORITIZED_IO: + in-latest-posix: "" + _POSIX_PRIORITY_SCHEDULING: + in-latest-posix: "" + _POSIX_RAW_SOCKETS: + in-latest-posix: "" + _POSIX_READER_WRITER_LOCKS: + in-latest-posix: "" + _POSIX_REALTIME_SIGNALS: + in-latest-posix: "" + _POSIX_REGEXP: + in-latest-posix: "" + _POSIX_SAVED_IDS: + in-latest-posix: "" + _POSIX_SEMAPHORES: + in-latest-posix: "" + _POSIX_SHARED_MEMORY_OBJECTS: + in-latest-posix: "" + _POSIX_SHELL: + in-latest-posix: "" + _POSIX_SPAWN: + in-latest-posix: "" + _POSIX_SPIN_LOCKS: + in-latest-posix: "" + _POSIX_SPORADIC_SERVER: + in-latest-posix: "" + _POSIX_SYNCHRONIZED_IO: + in-latest-posix: "" + _POSIX_THREAD_ATTR_STACKADDR: + in-latest-posix: "" + _POSIX_THREAD_ATTR_STACKSIZE: + in-latest-posix: "" + _POSIX_THREAD_CPUTIME: + in-latest-posix: "" + _POSIX_THREAD_PRIO_INHERIT: + in-latest-posix: "" + 
_POSIX_THREAD_PRIO_PROTECT: + in-latest-posix: "" + _POSIX_THREAD_PRIORITY_SCHEDULING: + in-latest-posix: "" + _POSIX_THREAD_PROCESS_SHARED: + in-latest-posix: "" + _POSIX_THREAD_ROBUST_PRIO_INHERIT: + in-latest-posix: "" + _POSIX_THREAD_ROBUST_PRIO_PROTECT: + in-latest-posix: "" + _POSIX_THREAD_SAFE_FUNCTIONS: + in-latest-posix: "" + _POSIX_THREAD_SPORADIC_SERVER: + in-latest-posix: "" + _POSIX_THREADS: + in-latest-posix: "" + _POSIX_TIMEOUTS: + in-latest-posix: "" + _POSIX_TIMERS: + in-latest-posix: "" + _POSIX_TYPED_MEMORY_OBJECTS: + in-latest-posix: "" + _POSIX_V7_ILP32_OFF32: + in-latest-posix: "" + _POSIX_V7_ILP32_OFFBIG: + in-latest-posix: "" + _POSIX_V7_LP64_OFF64: + in-latest-posix: "" + _POSIX_V7_LPBIG_OFFBIG: + in-latest-posix: "" + _POSIX_V8_ILP32_OFF32: + in-latest-posix: "" + _POSIX_V8_ILP32_OFFBIG: + in-latest-posix: "" + _POSIX_V8_LP64_OFF64: + in-latest-posix: "" + _POSIX_V8_LPBIG_OFFBIG: + in-latest-posix: "" + + _POSIX2_C_BIND: + in-latest-posix: "" + _POSIX2_C_DEV: + in-latest-posix: "" + _POSIX2_CHAR_TERM: + in-latest-posix: "" + _POSIX2_FORT_RUN: + in-latest-posix: "" + _POSIX2_LOCALEDEF: + in-latest-posix: "" + _POSIX2_SW_DEV: + in-latest-posix: "" + _POSIX2_UPE: + in-latest-posix: "" + _XOPEN_CRYPT: + in-latest-posix: "" + _XOPEN_ENH_I18N: + in-latest-posix: "" + _XOPEN_REALTIME: + in-latest-posix: "" + _XOPEN_REALTIME_THREADS: + in-latest-posix: "" + _XOPEN_SHM: + in-latest-posix: "" + _XOPEN_UNIX: + in-latest-posix: "" + _XOPEN_UUCP: + in-latest-posix: "" + + _POSIX_ASYNC_IO: + in-latest-posix: "" + _POSIX_FALLOC: + in-latest-posix: "" + _POSIX_PRIO_IO: + in-latest-posix: "" + _POSIX_SYNC_IO: + in-latest-posix: "" + _POSIX_TIMESTAMP_RESOLUTION: + in-latest-posix: "" + _POSIX2_SYMLINKS: + in-latest-posix: "" + + F_OK: + in-latest-posix: "" + R_OK: + in-latest-posix: "" + W_OK: + in-latest-posix: "" + X_OK: + in-latest-posix: "" + + _CS_PATH: + in-latest-posix: "" + _CS_POSIX_V8_ILP32_OFF32_CFLAGS: + in-latest-posix: "" + 
_CS_POSIX_V8_ILP32_OFF32_LDFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_ILP32_OFF32_LIBS: + in-latest-posix: "" + _CS_POSIX_V8_ILP32_OFFBIG_CFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_ILP32_OFFBIG_LDFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_ILP32_OFFBIG_LIBS: + in-latest-posix: "" + _CS_POSIX_V8_LP64_OFF64_CFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_LP64_OFF64_LDFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_LP64_OFF64_LIBS: + in-latest-posix: "" + _CS_POSIX_V8_LPBIG_OFFBIG_CFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_LPBIG_OFFBIG_LDFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_LPBIG_OFFBIG_LIBS: + in-latest-posix: "" + _CS_POSIX_V8_THREADS_CFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_THREADS_LDFLAGS: + in-latest-posix: "" + _CS_POSIX_V8_WIDTH_RESTRICTED_ENVS: + in-latest-posix: "" + _CS_V8_ENV: + in-latest-posix: "" + + SEEK_END: + in-latest-posix: "" + SEEK_SET: + in-latest-posix: "" + + F_LOCK: + in-latest-posix: "" + F_TEST: + in-latest-posix: "" + F_TLOCK: + in-latest-posix: "" + F_ULOCK: + in-latest-posix: "" + + _PC_2_SYMLINKS: + in-latest-posix: "" + _PC_ALLOC_SIZE_MIN: + in-latest-posix: "" + _PC_ASYNC_IO: + in-latest-posix: "" + _PC_CHOWN_RESTRICTED: + in-latest-posix: "" + _PC_FALLOC: + in-latest-posix: "" + _PC_FILESIZEBITS: + in-latest-posix: "" + _PC_LINK_MAX: + in-latest-posix: "" + _PC_MAX_CANON: + in-latest-posix: "" + _PC_MAX_INPUT: + in-latest-posix: "" + _PC_NAME_MAX: + in-latest-posix: "" + _PC_NO_TRUNC: + in-latest-posix: "" + _PC_PATH_MAX: + in-latest-posix: "" + _PC_PIPE_BUF: + in-latest-posix: "" + _PC_PRIO_IO: + in-latest-posix: "" + _PC_REC_INCR_XFER_SIZE: + in-latest-posix: "" + _PC_REC_MAX_XFER_SIZE: + in-latest-posix: "" + _PC_REC_MIN_XFER_SIZE: + in-latest-posix: "" + _PC_REC_XFER_ALIGN: + in-latest-posix: "" + _PC_SYMLINK_MAX: + in-latest-posix: "" + _PC_SYNC_IO: + in-latest-posix: "" + _PC_TEXTDOMAIN_MAX: + in-latest-posix: "" + _PC_TIMESTAMP_RESOLUTION: + in-latest-posix: "" + _PC_VDISABLE: + in-latest-posix: "" + + _SC_2_C_BIND: + 
in-latest-posix: "" + _SC_2_C_DEV: + in-latest-posix: "" + _SC_2_CHAR_TERM: + in-latest-posix: "" + _SC_2_FORT_RUN: + in-latest-posix: "" + _SC_2_LOCALEDEF: + in-latest-posix: "" + _SC_2_SW_DEV: + in-latest-posix: "" + _SC_2_UPE: + in-latest-posix: "" + _SC_2_VERSION: + in-latest-posix: "" + _SC_ADVISORY_INFO: + in-latest-posix: "" + _SC_AIO_LISTIO_MAX: + in-latest-posix: "" + _SC_AIO_MAX: + in-latest-posix: "" + _SC_AIO_PRIO_DELTA_MAX: + in-latest-posix: "" + _SC_ARG_MAX: + in-latest-posix: "" + _SC_ASYNCHRONOUS_IO: + in-latest-posix: "" + _SC_ATEXIT_MAX: + in-latest-posix: "" + _SC_BARRIERS: + in-latest-posix: "" + _SC_BC_BASE_MAX: + in-latest-posix: "" + _SC_BC_DIM_MAX: + in-latest-posix: "" + _SC_BC_SCALE_MAX: + in-latest-posix: "" + _SC_BC_STRING_MAX: + in-latest-posix: "" + _SC_CHILD_MAX: + in-latest-posix: "" + _SC_CLK_TCK: + in-latest-posix: "" + _SC_CLOCK_SELECTION: + in-latest-posix: "" + _SC_COLL_WEIGHTS_MAX: + in-latest-posix: "" + _SC_CPUTIME: + in-latest-posix: "" + _SC_DELAYTIMER_MAX: + in-latest-posix: "" + _SC_DEVICE_CONTROL: + in-latest-posix: "" + _SC_EXPR_NEST_MAX: + in-latest-posix: "" + _SC_FSYNC: + in-latest-posix: "" + _SC_GETGR_R_SIZE_MAX: + in-latest-posix: "" + _SC_GETPW_R_SIZE_MAX: + in-latest-posix: "" + _SC_HOST_NAME_MAX: + in-latest-posix: "" + _SC_IOV_MAX: + in-latest-posix: "" + _SC_IPV6: + in-latest-posix: "" + _SC_JOB_CONTROL: + in-latest-posix: "" + _SC_LINE_MAX: + in-latest-posix: "" + _SC_LOGIN_NAME_MAX: + in-latest-posix: "" + _SC_MAPPED_FILES: + in-latest-posix: "" + _SC_MEMLOCK: + in-latest-posix: "" + _SC_MEMLOCK_RANGE: + in-latest-posix: "" + _SC_MEMORY_PROTECTION: + in-latest-posix: "" + _SC_MESSAGE_PASSING: + in-latest-posix: "" + _SC_MONOTONIC_CLOCK: + in-latest-posix: "" + _SC_MQ_OPEN_MAX: + in-latest-posix: "" + _SC_MQ_PRIO_MAX: + in-latest-posix: "" + _SC_NGROUPS_MAX: + in-latest-posix: "" + _SC_NPROCESSORS_CONF: + in-latest-posix: "" + _SC_NPROCESSORS_ONLN: + in-latest-posix: "" + _SC_NSIG: + in-latest-posix: "" + 
_SC_OPEN_MAX: + in-latest-posix: "" + _SC_PAGE_SIZE: + in-latest-posix: "" + _SC_PAGESIZE: + in-latest-posix: "" + _SC_PRIORITIZED_IO: + in-latest-posix: "" + _SC_PRIORITY_SCHEDULING: + in-latest-posix: "" + _SC_RAW_SOCKETS: + in-latest-posix: "" + _SC_RE_DUP_MAX: + in-latest-posix: "" + _SC_READER_WRITER_LOCKS: + in-latest-posix: "" + _SC_REALTIME_SIGNALS: + in-latest-posix: "" + _SC_REGEXP: + in-latest-posix: "" + _SC_RTSIG_MAX: + in-latest-posix: "" + _SC_SAVED_IDS: + in-latest-posix: "" + _SC_SEM_NSEMS_MAX: + in-latest-posix: "" + _SC_SEM_VALUE_MAX: + in-latest-posix: "" + _SC_SEMAPHORES: + in-latest-posix: "" + _SC_SHARED_MEMORY_OBJECTS: + in-latest-posix: "" + _SC_SHELL: + in-latest-posix: "" + _SC_SIGQUEUE_MAX: + in-latest-posix: "" + _SC_SPAWN: + in-latest-posix: "" + _SC_SPIN_LOCKS: + in-latest-posix: "" + _SC_SPORADIC_SERVER: + in-latest-posix: "" + _SC_SS_REPL_MAX: + in-latest-posix: "" + _SC_STREAM_MAX: + in-latest-posix: "" + _SC_SYMLOOP_MAX: + in-latest-posix: "" + _SC_SYNCHRONIZED_IO: + in-latest-posix: "" + _SC_THREAD_ATTR_STACKADDR: + in-latest-posix: "" + _SC_THREAD_ATTR_STACKSIZE: + in-latest-posix: "" + _SC_THREAD_CPUTIME: + in-latest-posix: "" + _SC_THREAD_DESTRUCTOR_ITERATIONS: + in-latest-posix: "" + _SC_THREAD_KEYS_MAX: + in-latest-posix: "" + _SC_THREAD_PRIO_INHERIT: + in-latest-posix: "" + _SC_THREAD_PRIO_PROTECT: + in-latest-posix: "" + _SC_THREAD_PRIORITY_SCHEDULING: + in-latest-posix: "" + _SC_THREAD_PROCESS_SHARED: + in-latest-posix: "" + _SC_THREAD_ROBUST_PRIO_INHERIT: + in-latest-posix: "" + _SC_THREAD_ROBUST_PRIO_PROTECT: + in-latest-posix: "" + _SC_THREAD_SAFE_FUNCTIONS: + in-latest-posix: "" + _SC_THREAD_SPORADIC_SERVER: + in-latest-posix: "" + _SC_THREAD_STACK_MIN: + in-latest-posix: "" + _SC_THREAD_THREADS_MAX: + in-latest-posix: "" + _SC_THREADS: + in-latest-posix: "" + _SC_TIMEOUTS: + in-latest-posix: "" + _SC_TIMER_MAX: + in-latest-posix: "" + _SC_TIMERS: + in-latest-posix: "" + _SC_TTY_NAME_MAX: + in-latest-posix: "" + 
_SC_TYPED_MEMORY_OBJECTS: + in-latest-posix: "" + _SC_TZNAME_MAX: + in-latest-posix: "" + _SC_V8_ILP32_OFF32: + in-latest-posix: "" + _SC_V8_ILP32_OFFBIG: + in-latest-posix: "" + _SC_V8_LP64_OFF64: + in-latest-posix: "" + _SC_V8_LPBIG_OFFBIG: + in-latest-posix: "" + _SC_V7_ILP32_OFF32: + in-latest-posix: "" + _SC_V7_ILP32_OFFBIG: + in-latest-posix: "" + _SC_V7_LP64_OFF64: + in-latest-posix: "" + _SC_V7_LPBIG_OFFBIG: + in-latest-posix: "" + _SC_VERSION: + in-latest-posix: "" + _SC_XOPEN_CRYPT: + in-latest-posix: "" + _SC_XOPEN_ENH_I18N: + in-latest-posix: "" + _SC_XOPEN_REALTIME: + in-latest-posix: "" + _SC_XOPEN_REALTIME_THREADS: + in-latest-posix: "" + _SC_XOPEN_SHM: + in-latest-posix: "" + _SC_XOPEN_UNIX: + in-latest-posix: "" + _SC_XOPEN_UUCP: + in-latest-posix: "" + _SC_XOPEN_VERSION: + in-latest-posix: "" + + STDERR_FILENO: + in-latest-posix: "" + STDIN_FILENO: + in-latest-posix: "" + STDOUT_FILENO: + in-latest-posix: "" + _POSIX_VDISABLE: + in-latest-posix: "" + POSIX_CLOSE_RESTART: + in-latest-posix: "" + +functions: + access: + in-latest-posix: "" + alarm: + in-latest-posix: "" + chdir: + in-latest-posix: "" + chown: + in-latest-posix: "" + close: + in-latest-posix: "" + confstr: + in-latest-posix: "" + crypt: + in-latest-posix: "" + dup: + in-latest-posix: "" + dup2: + in-latest-posix: "" + dup3: + in-latest-posix: "" + _exit: + in-latest-posix: "" + encrypt: + in-latest-posix: "" + execl: + in-latest-posix: "" + execle: + in-latest-posix: "" + execlp: + in-latest-posix: "" + execv: + in-latest-posix: "" + execve: + in-latest-posix: "" + execvp: + in-latest-posix: "" + faccessat: + in-latest-posix: "" + fchdir: + in-latest-posix: "" + fchown: + in-latest-posix: "" + fchownat: + in-latest-posix: "" + fdatasync: + in-latest-posix: "" + fexecve: + in-latest-posix: "" + _Fork: + in-latest-posix: "" + fork: + in-latest-posix: "" + fpathconf: + in-latest-posix: "" + fsync: + in-latest-posix: "" + ftruncate: + in-latest-posix: "" + getcwd: + in-latest-posix: "" + 
getegid: + in-latest-posix: "" + getentropy: + in-latest-posix: "" + geteuid: + in-latest-posix: "" + getgid: + in-latest-posix: "" + getgroups: + in-latest-posix: "" + gethostid: + in-latest-posix: "" + gethostname: + in-latest-posix: "" + getlogin: + in-latest-posix: "" + getlogin_r: + in-latest-posix: "" + getopt: + in-latest-posix: "" + getpgid: + in-latest-posix: "" + getpgrp: + in-latest-posix: "" + getpid: + in-latest-posix: "" + getppid: + in-latest-posix: "" + getresgid: + in-latest-posix: "" + getresuid: + in-latest-posix: "" + getsid: + in-latest-posix: "" + getuid: + in-latest-posix: "" + isatty: + in-latest-posix: "" + lchown: + in-latest-posix: "" + link: + in-latest-posix: "" + linkat: + in-latest-posix: "" + lockf: + in-latest-posix: "" + lseek: + in-latest-posix: "" + nice: + in-latest-posix: "" + pathconf: + in-latest-posix: "" + pause: + in-latest-posix: "" + pipe: + in-latest-posix: "" + pipe2: + in-latest-posix: "" + posix_close: + in-latest-posix: "" + pread: + in-latest-posix: "" + pwrite: + in-latest-posix: "" + read: + in-latest-posix: "" + readlink: + in-latest-posix: "" + readlinkat: + in-latest-posix: "" + rmdir: + in-latest-posix: "" + setegid: + in-latest-posix: "" + seteuid: + in-latest-posix: "" + setgid: + in-latest-posix: "" + setpgid: + in-latest-posix: "" + setregid: + in-latest-posix: "" + setresgid: + in-latest-posix: "" + setresuid: + in-latest-posix: "" + setreuid: + in-latest-posix: "" + setsid: + in-latest-posix: "" + setuid: + in-latest-posix: "" + sleep: + in-latest-posix: "" + swab: + in-latest-posix: "" + symlink: + in-latest-posix: "" + symlinkat: + in-latest-posix: "" + sync: + in-latest-posix: "" + sysconf: + in-latest-posix: "" + tcgetpgrp: + in-latest-posix: "" + tcsetpgrp: + in-latest-posix: "" + truncate: + in-latest-posix: "" + ttyname: + in-latest-posix: "" + ttyname_r: + in-latest-posix: "" + unlink: + in-latest-posix: "" + unlinkat: + in-latest-posix: "" + write: + in-latest-posix: "" From 
e10d551aa482ee185a80216b2670a2947a8bdeb0 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 23 Jan 2025 13:45:46 -0600 Subject: [PATCH 206/208] [mlir][PDLL] Allow (and ignore) `-D` tablegen macros. (#124166) Similar to #91329, `mlir-pdll` is a tool used in tablegen macros that unregisters from common flags, including `-D` macros. Because a macro may be used globally, e.g. configured via `LLVM_TABLEGEN_FLAGS`, we want this tool to just ignore the macro instead of a fatal failure due to the unrecognized flag. --- mlir/tools/mlir-pdll/mlir-pdll.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mlir/tools/mlir-pdll/mlir-pdll.cpp b/mlir/tools/mlir-pdll/mlir-pdll.cpp index 0fcf8d1b12d60f..88a5f3639c9625 100644 --- a/mlir/tools/mlir-pdll/mlir-pdll.cpp +++ b/mlir/tools/mlir-pdll/mlir-pdll.cpp @@ -167,6 +167,15 @@ int main(int argc, char **argv) { "write-if-changed", llvm::cl::desc("Only write to the output file if it changed")); + // `ResetCommandLineParser` at the above unregistered the "D" option + // of `llvm-tblgen`, which causes tblgen usage to fail due to + // "Unknown command line argument '-D...`" when a macro name is + // present. The following is a workaround to re-register it again. + llvm::cl::list macroNames( + "D", + llvm::cl::desc("Name of the macro to be defined -- ignored by mlir-pdll"), + llvm::cl::value_desc("macro name"), llvm::cl::Prefix); + llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv, "PDLL Frontend"); From c4ca87ee78fa4f1978e018e6e3a260ac9aea399d Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 23 Jan 2025 12:22:16 -0800 Subject: [PATCH 207/208] [yaml2obj] Don't use uninitialized Type (#123274) Alternative to #123137 With -DMACHINE=EM_NONE, machine specific sections, like SHT_ARM_EXIDX, will fail to parse and set `Type`.
This triggers msan on ``` yaml2obj llvm-project/llvm/test/tools/yaml2obj/ELF/mips-abi-flags.yaml -DMACHINE=EM_NONE ``` --- llvm/lib/ObjectYAML/ELFYAML.cpp | 2 +- .../test/tools/yaml2obj/ELF/section-type.yaml | 20 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 24f426a9aa1f7c..539834fc8d4dbf 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1588,7 +1588,7 @@ static bool isInteger(StringRef Val) { void MappingTraits>::mapping( IO &IO, std::unique_ptr &Section) { - ELFYAML::ELF_SHT Type; + ELFYAML::ELF_SHT Type = ELF::SHT_NULL; StringRef TypeStr; if (IO.outputting()) { if (auto *S = dyn_cast(Section.get())) diff --git a/llvm/test/tools/yaml2obj/ELF/section-type.yaml b/llvm/test/tools/yaml2obj/ELF/section-type.yaml index ad2edd942cc2aa..6f5f42aceafedc 100644 --- a/llvm/test/tools/yaml2obj/ELF/section-type.yaml +++ b/llvm/test/tools/yaml2obj/ELF/section-type.yaml @@ -1,5 +1,5 @@ -# RUN: yaml2obj %s -o %t -# RUN: llvm-readobj --sections %t | FileCheck %s +# RUN: yaml2obj %s --docnum=1 -o %t1 +# RUN: llvm-readobj --sections %t1 | FileCheck %s # CHECK: Name: enum # CHECK: Type: SHT_PROGBITS @@ -25,3 +25,19 @@ Sections: Type: 0xabcd - Name: decimal Type: 1234 + +## Check that we can handle unknown section and chunk types. +# RUN: not yaml2obj %s --docnum=2 -DSECTION_TYPE=UNKNOWN_TYPE -o %t2 2>&1 | FileCheck %s --check-prefix=UNKNOWN-TYPE +# RUN: not yaml2obj %s --docnum=2 -DSECTION_TYPE=SHT_UNKNOWN_TYPE -o %t2 2>&1 | FileCheck %s --check-prefix=UNKNOWN-TYPE + +# UNKNOWN-TYPE: error: invalid hex32 number + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .foo + Type: [[SECTION_TYPE]] + From 7a831eb924e34e9c5e62f3b5a8e0db0278284f84 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 23 Jan 2025 20:38:47 +0000 Subject: [PATCH 208/208] [VPlan] Remove unused VPLane::getNumCachedLanes. 
(NFC) The function isn't used, remove it. --- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 16c64f32ab6349..b52ee3c2428f3f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -223,12 +223,6 @@ class VPLane { return Lane; } } - - /// Returns the maxmimum number of lanes that we are able to consider - /// caching for \p VF. - static unsigned getNumCachedLanes(const ElementCount &VF) { - return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1); - } }; /// VPTransformState holds information passed down when "executing" a VPlan,