Skip to content

Commit

Permalink
AMDGPU/EG,CM: Implement fsqrt using recip(rsqrt(x)) instead of x * rs…
Browse files Browse the repository at this point in the history
…qrt(x)

The old x * rsqrt(x) version might be faster on EG (RECIP_IEEE is Trans-only),
but it would need extra corner-case checks.
Using recip(rsqrt(x)) gives correct corner-case behaviour and saves a register.
Fixes the OCL CTS sqrt test (1-thread, scalar) on Turks.

Reviewer: arsenm
Differential Revision: https://reviews.llvm.org/D74017
  • Loading branch information
jvesely committed Feb 5, 2020
1 parent 8c3e6af commit e6686ad
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 18 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/CaymanInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def COS_cm : COS_Common<0x8E>;

def : RsqPat<RECIPSQRT_IEEE_cm, f32>;

def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;

def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;

defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
Expand All @@ -70,8 +72,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {



def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;

class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/EvergreenInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;

def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;

def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
} // End SubtargetPredicate = isEG

//===----------------------------------------------------------------------===//
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/R600Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1233,6 +1233,11 @@ def : R600Pat<
def : RcpPat<recip_ieee, f32>;
}

// Selection pattern that lowers an f32 fsqrt to RecipInst(RsqInst($src)),
// i.e. sqrt(x) is computed as the reciprocal of the reciprocal square root.
//   RsqInst   - instruction applied to $src first (the reciprocal square root).
//   RecipInst - instruction applied to the RsqInst result (the reciprocal).
class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
(fsqrt f32:$src),
(RecipInst (RsqInst $src))
>;

//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;

def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;

def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
Expand Down
38 changes: 24 additions & 14 deletions llvm/test/CodeGen/AMDGPU/fsqrt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float ad
; FUNC-LABEL: {{^}}s_sqrt_f32:
; GCN: v_sqrt_f32_e32

; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
entry:
%fdiv = call float @llvm.sqrt.f32(float %in)
Expand All @@ -40,10 +40,10 @@ entry:
; GCN: v_sqrt_f32_e32
; GCN: v_sqrt_f32_e32

; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
entry:
%fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
Expand All @@ -57,14 +57,14 @@ entry:
; GCN: v_sqrt_f32_e32
; GCN: v_sqrt_f32_e32

; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
entry:
%fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
Expand Down Expand Up @@ -134,6 +134,16 @@ entry:
ret void
}

; Checks that a fast-math 1.0 / sqrt(x) on the R600 prefix emits RECIPSQRT_IEEE
; and that no RECIP_IEEE follows it, i.e. the divide is not emitted as a
; separate reciprocal of the square root.
; FUNC-LABEL: {{^}}recip_sqrt:
; R600: RECIPSQRT_IEEE
; R600-NOT: RECIP_IEEE
define amdgpu_kernel void @recip_sqrt(float addrspace(1)* %out, float %src) nounwind {
%sqrt = call float @llvm.sqrt.f32(float %src)
%recipsqrt = fdiv fast float 1.0, %sqrt
store float %recipsqrt, float addrspace(1)* %out, align 4
ret void
}

declare float @llvm.sqrt.f32(float %in) #0
declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
Expand Down

0 comments on commit e6686ad

Please sign in to comment.