diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 5af46989aca97..bee4c47a23ba6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -228,16 +228,39 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI< // Pseudo instructions used for @llvm.fptrunc.round. The final codegen is done // in the ModeRegister pass. let Uses = [MODE, EXEC] in { +let True16Predicate = NotHasTrue16BitInsts in def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VGPR_32:$src0, i32imm:$round)>; +let True16Predicate = UseFakeTrue16Insts in +def FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0, i32imm:$round)>; + +let True16Predicate = UseRealTrue16Insts in +// The operands of these pseudos should match V_CVT_F16_F32_t16_e64 +def FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 : VPseudoInstSI <(outs VOPDstOperand_t16:$vdst), + (ins FP32InputMods:$src0_modifiers, VSrc_f32:$src0, Clamp0:$clamp, omod0:$omod, op_sel0:$op_sel, i32imm:$round)> { + let FPClamp = 1; + let ClampLo = 1; + let UseNamedOperandTable = 1; +} + def FPTRUNC_ROUND_F32_F64_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VReg_64:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] +let True16Predicate = NotHasTrue16BitInsts in def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $src0, (as_hw_round_mode $round))>; + +let True16Predicate = UseRealTrue16Insts in +def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 $src0_modifiers, $src0, (as_hw_round_mode $round))>; + def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F32_F64_PSEUDO $src0, (as_hw_round_mode $round))>; diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 412e2f2fe45d1..99aea52c184d6 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -166,6 +166,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, unsigned Opcode = MI.getOpcode(); if (TII->usesFPDPRounding(MI) || Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO || + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 || + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 || Opcode == AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO) { switch (Opcode) { case AMDGPU::V_INTERP_P1LL_F16: @@ -177,19 +179,19 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO: { unsigned Mode = MI.getOperand(2).getImm(); MI.removeOperand(2); - // Replacing the pseudo by a real instruction in place - if (TII->getSubtarget().hasTrue16BitInsts()) { - MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder B(*MBB.getParent(), MI); - MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_fake16_e64)); - MachineOperand Src0 = MI.getOperand(1); - MI.removeOperand(1); - B.addImm(0); // src0_modifiers - B.add(Src0); // re-add src0 operand - B.addImm(0); // clamp - B.addImm(0); // omod - } else - MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); + MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32: { + unsigned Mode = MI.getOperand(2).getImm(); + MI.removeOperand(2); + MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_fake16_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64: { + unsigned Mode = MI.getOperand(6).getImm(); + MI.removeOperand(6); + MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_t16_e64)); return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); } case AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO: { diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus-fake16.mir b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus-fake16.mir new file mode 100644 index 0000000000000..8667934d70ff0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus-fake16.mir @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass si-mode-register %s -o - | FileCheck %s --check-prefixes=GFX11 + +--- +name: ftrunc_tonearest + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_tonearest + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $vgpr0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_upward + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_upward + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $vgpr0, 1, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_downward + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_downward + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr0 = V_CVT_F16_F32_fake16_e32 $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr0 = FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $vgpr1, 2, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_towardzero + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_towardzero + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $vgpr0, 3, implicit $mode, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus.mir b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus.mir new file mode 100644 index 0000000000000..9b439af4bf7b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.gfx11plus.mir @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass si-mode-register %s -o - | FileCheck %s --check-prefixes=GFX11 + +--- +name: ftrunc_tonearest + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_tonearest + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: $vgpr1_hi16 = V_CVT_F16_F32_t16_e64 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1_hi16 = FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 0, $vgpr0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_upward + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_upward + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr1_hi16 = V_CVT_F16_F32_t16_e64 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1_hi16 = FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 0, $vgpr0, 0, 0, 0, 1, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_downward + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_downward + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr0_hi16 = V_CVT_F16_F32_t16_e64 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr0_hi16 = FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 0, $vgpr1, 0, 0, 0, 2, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: ftrunc_towardzero + +body: | + bb.0: + liveins: $sgpr0 + ; GFX11-LABEL: name: ftrunc_towardzero + ; GFX11: liveins: $sgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX11-NEXT: S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode + ; GFX11-NEXT: $vgpr1_hi16 = V_CVT_F16_F32_t16_e64 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1_hi16 = FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 0, $vgpr0, 0, 0, 0, 3, implicit $mode, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir index 67eb719fd2c0d..a28f347603ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir +++ b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s --check-prefixes=CHECK # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-mode-register %s -o - | FileCheck %s --check-prefixes=CHECK -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-mode-register %s -o - | FileCheck %s --check-prefixes=GFX11 --- name: ftrunc_tonearest @@ -15,13 +14,6 @@ body: | ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; CHECK-NEXT: $vgpr1 = V_CVT_F16_F32_e32 $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 - ; - ; GFX11-LABEL: name: ftrunc_tonearest - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e64 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0 $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO $vgpr0, 0, implicit $mode, implicit $exec S_ENDPGM 0 @@ -39,14 +31,6 @@ body: | ; CHECK-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode ; CHECK-NEXT: $vgpr1 = V_CVT_F16_F32_e32 $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 - ; - ; GFX11-LABEL: name: ftrunc_upward - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode - ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e64 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0 $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO $vgpr0, 1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -64,14 +48,6 @@ body: | ; CHECK-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode ; CHECK-NEXT: $vgpr0 = V_CVT_F16_F32_e32 $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 - ; - ; GFX11-LABEL: name: ftrunc_downward - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - ; GFX11-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode - ; GFX11-NEXT: $vgpr0 = V_CVT_F16_F32_fake16_e64 0, $vgpr1, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0 $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr0 = FPTRUNC_ROUND_F16_F32_PSEUDO $vgpr1, 2, implicit $mode, implicit $exec S_ENDPGM 0 @@ -89,14 +65,6 @@ body: | ; CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode ; CHECK-NEXT: $vgpr1 = V_CVT_F16_F32_e32 $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 - ; - ; GFX11-LABEL: name: ftrunc_towardzero - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - ; GFX11-NEXT: S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode - ; GFX11-NEXT: $vgpr1 = V_CVT_F16_F32_fake16_e64 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0 $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1 = FPTRUNC_ROUND_F16_F32_PSEUDO $vgpr0, 3, implicit $mode, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 4a1287c56ea8e..0ad1c30b5b5a4 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9ALL,GFX900 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9ALL,GFX906 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i16 @shl_i16(i16 %x, i16 %y) { ; GFX8-LABEL: shl_i16: @@ -24,11 +25,17 @@ define i16 @shl_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_lshlrev_b16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: shl_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: shl_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: shl_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = shl i16 %x, %y ret i16 %res } @@ -52,11 +59,17 @@ define i16 @lshr_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: lshr_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: lshr_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = lshr i16 %x, %y ret i16 %res } @@ -80,11 +93,17 @@ define i16 @ashr_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_ashrrev_i16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: ashr_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i16 v0, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: ashr_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: ashr_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = ashr i16 %x, %y ret i16 %res } @@ -108,11 +127,18 @@ define i16 @add_u16(i16 %x, i16 %y) { ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_u16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: add_u16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: add_u16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = add i16 %x, %y ret i16 %res } @@ -136,11 +162,18 @@ define i16 @sub_u16(i16 %x, i16 %y) { ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: sub_u16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: sub_u16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: sub_u16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = sub i16 %x, %y ret i16 %res } @@ -164,11 +197,18 @@ define i16 @mul_lo_u16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_lo_u16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: mul_lo_u16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: mul_lo_u16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = mul i16 %x, %y ret i16 %res } @@ -192,11 +232,18 @@ define i16 @min_u16(i16 %x, i16 %y) { ; GFX10-NEXT: v_min_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: min_u16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_u16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: min_u16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: min_u16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp ule i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y ret i16 %res @@ -221,11 +268,18 @@ define i16 @min_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_min_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: min_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: min_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: min_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_i16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp sle i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y ret i16 %res @@ -250,11 +304,18 @@ define i16 @max_u16(i16 %x, i16 %y) { ; GFX10-NEXT: v_max_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: max_u16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_u16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: max_u16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: max_u16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp uge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y ret i16 %res @@ -279,11 +340,18 @@ define i16 @max_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_max_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: max_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_i16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: max_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: max_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_i16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp sge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y ret i16 %res @@ -309,12 +377,19 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: shl_i16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: shl_i16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: shl_i16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = shl i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -340,12 +415,19 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_i16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: lshr_i16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: lshr_i16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = lshr i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -371,12 +453,19 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: ashr_i16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i16 v0, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: ashr_i16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: ashr_i16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = ashr i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -402,12 +491,20 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_u16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: add_u16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: add_u16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = add i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -433,12 +530,20 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: sub_u16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: sub_u16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: sub_u16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = sub i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -464,12 +569,20 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_lo_u16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: mul_lo_u16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: mul_lo_u16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %res = mul i16 %x, %y %zext = zext i16 %res to i32 ret i32 %zext @@ -495,12 +608,20 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: min_u16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_u16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: min_u16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: min_u16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp ule i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y %zext = zext i16 %res to i32 @@ -527,12 +648,20 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: min_i16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: min_i16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: min_i16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_i16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp sle i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y %zext = zext i16 %res to i32 @@ -559,12 +688,20 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: max_u16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_u16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: max_u16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: max_u16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp uge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y %zext = zext i16 %res to i32 @@ -591,12 +728,20 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: max_i16_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_i16 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: max_i16_zext_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: max_i16_zext_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_i16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp sge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y %zext = zext i16 %res to i32 @@ -623,12 +768,20 @@ define i32 @zext_fadd_f16(half %x, half %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: zext_fadd_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: zext_fadd_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: zext_fadd_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %add = fadd half %x, %y %cast = bitcast half %add to i16 %zext = zext i16 %cast to i32 @@ -656,12 +809,21 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: zext_fma_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: zext_fma_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: zext_fma_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) %cast = bitcast half %fma to i16 %zext = zext i16 %cast to i32 @@ -689,12 +851,21 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: zext_div_fixup_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_fixup_f16 v0, v0, v1, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: zext_div_fixup_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: zext_div_fixup_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_div_fixup_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z) %cast = bitcast half %div.fixup to i16 %zext = zext i16 %cast to i32 @@ -724,12 +895,19 @@ define i32 @zext_fptrunc_f16(float %x) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: zext_fptrunc_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: zext_fptrunc_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: zext_fptrunc_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fptrunc = fptrunc float %x to half %cast = bitcast half %fptrunc to i16 %zext = zext i16 %cast to i32 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index 0f80327638a9c..52d882590cbce 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -2,8 +2,9 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { ; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: @@ -20,11 +21,23 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { ; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -58,13 +71,29 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1011-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX1011-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -103,14 +132,33 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1011-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX1011-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX1011-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v2 +; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -181,12 +229,26 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) # ; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1011-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") %neg.val = fneg half %val ret half %neg.val @@ -207,11 +269,23 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) # ; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg float %arg %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -255,11 +329,23 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret void } @@ -316,14 +402,23 @@ define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x flo ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: global_store_b32 v[2:3], v0, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-TRUE16-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") store <2 x half> %result, ptr addrspace(1) %ptr ret void @@ -344,11 +439,23 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float % ; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg float %arg %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret void @@ -369,11 +476,23 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float % ; GFX89-NEXT: v_cvt_f16_f32_e64 v0, |v0| ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |v0| +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %abs.arg = call float @llvm.fabs.f32(float %arg) #0 %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret void