diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e4533570f75086..2e065a938a3109 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41500,6 +41500,21 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
             ShuffleVT,
             DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
       }
+      // TODO: We can generalize this for other shuffles/conversions.
+      if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
+          N1.getOpcode() == SrcOpcode &&
+          N0.getValueType() == N1.getValueType() &&
+          N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
+          ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
+          IsSafeToMoveShuffle(N0, SrcOpcode) &&
+          IsSafeToMoveShuffle(N1, SrcOpcode)) {
+        EVT OpSrcVT = N0.getOperand(0).getValueType();
+        EVT OpDstVT = N0.getValueType();
+        SDValue Res =
+            DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
+        return DAG.getBitcast(ShuffleVT,
+                              DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
+      }
     }
     break;
   }
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index ef0f3f3e816dfa..5148d1566c6294 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -4966,22 +4966,18 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; F16C-LABEL: fptosi_2f16_to_4i32:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpsrld $16, %xmm0, %xmm1
-; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; F16C-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: fptosi_2f16_to_4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512-NEXT:    retq
@@ -5084,11 +5080,9 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; F16C-LABEL: fptoui_2f16_to_4i32:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpsrld $16, %xmm0, %xmm1
-; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; F16C-NEXT:    vcvttps2dq %xmm0, %xmm1
 ; F16C-NEXT:    vpsrad $31, %xmm1, %xmm2
 ; F16C-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -5100,11 +5094,9 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; AVX512F-LABEL: fptoui_2f16_to_4i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512F-NEXT:    vzeroupper
@@ -5112,11 +5104,9 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
 ; AVX512-FASTLANE:       # %bb.0:
-; AVX512-FASTLANE-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-FASTLANE-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-FASTLANE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-FASTLANE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-FASTLANE-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-FASTLANE-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512-FASTLANE-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512-FASTLANE-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; AVX512-FASTLANE-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512-FASTLANE-NEXT:    retq
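
Note: the new combine rewrites UNPCKL(CVTPH2PS(X), CVTPH2PS(Y)) into CVTPH2PS(UNPCKL(X, Y)), which is why the updated CHECK lines show a single vcvtph2ps behind a zero blend instead of two conversions feeding a vunpcklps. As a minimal sketch of IR that exercises this path (the test bodies themselves are not part of this diff, so the exact IR below is an assumption reconstructed from the function signature in the hunk headers):

  ; Hypothetical reconstruction of the fptosi_2f16_to_4i32 test body: during
  ; lowering each f16 lane is converted to f32 separately, so the old DAG
  ; contained UNPCKL(CVTPH2PS(lo), CVTPH2PS(hi)) ahead of the f32->i32
  ; truncating conversion; hoisting the UNPCKL lets later combines collapse
  ; everything into one vcvtph2ps.
  define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
    %cvt = fptosi <2 x half> %a to <2 x i32>
    %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    ret <4 x i32> %ext
  }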