Skip to content

Commit 389b67f

Browse files
Arm: Add NEON and MVE complex mul, mla and mls patterns.
This adds an implementation of the optabs for complex operations. With this, the
following C code:

  void g (float complex a[restrict N], float complex b[restrict N],
          float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] * b[i];
  }

generates

NEON:

g:
        vmov.f32 q11, #0.0 @ v4sf
        add r3, r2, #1600
.L2:
        vmov q8, q11 @ v4sf
        vld1.32 {q10}, [r1]!
        vld1.32 {q9}, [r0]!
        vcmla.f32 q8, q9, q10, #0
        vcmla.f32 q8, q9, q10, #90
        vst1.32 {q8}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

MVE:

g:
        push {lr}
        mov lr, #100
        dls lr, lr
.L2:
        vldrw.32 q1, [r1], #16
        vldrw.32 q2, [r0], #16
        vcmul.f32 q3, q2, q1, #0
        vcmla.f32 q3, q2, q1, #90
        vstrw.32 q3, [r2], #16
        le lr, .L2
        ldr pc, [sp], #4

instead of

g:
        add r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32 q14, q11, q9
        vmul.f32 q15, q11, q8
        vneg.f32 q14, q14
        vfma.f32 q15, q10, q9
        vfma.f32 q14, q10, q8
        vmov q13, q15 @ v4sf
        vmov q12, q14 @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

and

g:
        add r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32 q15, q10, q8
        vmul.f32 q14, q10, q9
        vmls.f32 q15, q11, q9
        vmla.f32 q14, q11, q8
        vmov q12, q15 @ v4sf
        vmov q13, q14 @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

respectively.

gcc/ChangeLog:

        * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
        VCMLA_OP, VCMUL_OP): New.
        * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0.
        * config/arm/neon.md (cmul<conj_op><mode>3): New.
        * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ,
        UNSPEC_VCMUL_CONJ): New.
        * config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>,
        cml<fcmac1><conj_op><mode>4): New.
1 parent 02551aa commit 389b67f

File tree

5 files changed

+126
-6
lines changed

5 files changed

+126
-6
lines changed

gcc/config/arm/iterators.md

+40
Original file line numberDiff line numberDiff line change
@@ -1186,6 +1186,33 @@
11861186
(UNSPEC_VCMLA180 "180")
11871187
(UNSPEC_VCMLA270 "270")])
11881188

1189+
;; The complex operations when performed on a real complex number require two
1190+
;; instructions to perform the operation. E.g. complex multiplication requires
1191+
;; two VCMUL with a particular rotation value.
1192+
;;
1193+
;; These values can be looked up in rotsplit1 and rotsplit2. As an example,
1194+
;; VCMUL needs the first instruction to use #0 and the second #90.
1195+
;; rotsplit1: rotation string substituted into the name of the first
;; gen_*_vcmla<rotsplit1><mode> call of a two-instruction expansion.
;; It is "0" for every unspec listed here except the two VCMLA180
;; variants, which use "180".
(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
1196+
(UNSPEC_VCMLA_CONJ "0")
1197+
(UNSPEC_VCMUL "0")
1198+
(UNSPEC_VCMUL_CONJ "0")
1199+
(UNSPEC_VCMLA180 "180")
1200+
(UNSPEC_VCMLA180_CONJ "180")])
1201+
1202+
;; rotsplit2: rotation string substituted into the name of the second
;; gen_*_vcmla<rotsplit2><mode> call of a two-instruction expansion.
;; Each _CONJ unspec maps to the other quarter-turn of its plain
;; counterpart: 90 <-> 270 for VCMLA/VCMUL and 270 <-> 90 for VCMLA180.
(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
1203+
(UNSPEC_VCMLA_CONJ "270")
1204+
(UNSPEC_VCMUL "90")
1205+
(UNSPEC_VCMUL_CONJ "270")
1206+
(UNSPEC_VCMLA180 "270")
1207+
(UNSPEC_VCMLA180_CONJ "90")])
1208+
1209+
;; conj_op: name suffix used when building the cmul<conj_op><mode>3 and
;; cml<fcmac1><conj_op><mode>4 expander names.  It is "_conj" for the
;; *_CONJ unspecs and empty for the plain ones.
(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
1210+
(UNSPEC_VCMLA180_CONJ "_conj")
1211+
(UNSPEC_VCMLA "")
1212+
(UNSPEC_VCMLA_CONJ "_conj")
1213+
(UNSPEC_VCMUL "")
1214+
(UNSPEC_VCMUL_CONJ "_conj")])
1215+
11891216
(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
11901217
(UNSPEC_VCADD270 "_rot270")
11911218
(UNSPEC_VCMLA "")
@@ -1200,6 +1227,9 @@
12001227
(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
12011228
UNSPEC_VCMUL180 UNSPEC_VCMUL270])
12021229

1230+
;; fcmac1: letter inserted into the cml<fcmac1><conj_op><mode>4 expander
;; name.  "a" for the UNSPEC_VCMLA variants and "s" for the
;; UNSPEC_VCMLA180 variants, giving names of the form cmla... / cmls...
(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
1231+
(UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
1232+
12031233
(define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
12041234
(UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
12051235
(UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
@@ -1723,3 +1753,13 @@
17231753
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
17241754
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
17251755
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
1756+
1757+
;; Define iterators for VCMLA operations
1758+
;; VCMLA_OP iterates over the four accumulate forms: plain and _CONJ,
;; each with and without the 180 variant.  It is the unspec iterator of
;; the cml<fcmac1><conj_op><mode>4 expander.
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
1759+
UNSPEC_VCMLA_CONJ
1760+
UNSPEC_VCMLA180
1761+
UNSPEC_VCMLA180_CONJ])
1762+
1763+
;; Define iterators for VCMLA operations as MUL
1764+
;; VCMUL_OP iterates over the plain and conjugate multiply unspecs.  It
;; is the unspec iterator of the cmul<conj_op><mode>3 expanders.
(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
1765+
UNSPEC_VCMUL_CONJ])

gcc/config/arm/mve.md

+7-6
Original file line numberDiff line numberDiff line change
@@ -4101,15 +4101,16 @@
41014101
;; Reading the added ("+") lines of this hunk: operand 0 is set to
;; operand 1 plus the VCMLA unspec of operands 2 and 3.
;;   Alternative 0: operand 1 matches constraint "Dz" (with predicate
;;   reg_or_zero_operand -- presumably an all-zeros vector; confirm in
;;   constraints.md) and the vcmul template is printed.
;;   Alternative 1: operand 1 is tied to operand 0 ("0") and the vcmla
;;   template is printed.
;; The removed ("-") lines listed the alternatives and templates in the
;; opposite order.
(define_insn "mve_vcmlaq<mve_rot><mode>"
41024102
[
41034103
(set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
4104-
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
4105-
(match_operand:MVE_0 2 "s_register_operand" "w,w")
4106-
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
4107-
VCMLA))
4104+
(plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
4105+
(unspec:MVE_0
4106+
[(match_operand:MVE_0 2 "s_register_operand" "w,w")
4107+
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
4108+
VCMLA)))
41084109
]
41094110
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
41104111
"@
4111-
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
4112-
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
4112+
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
4113+
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
41134114
[(set_attr "type" "mve_move")
41144115
])
41154116

gcc/config/arm/neon.md

+19
Original file line numberDiff line numberDiff line change
@@ -2952,6 +2952,25 @@
29522952
[(set_attr "type" "neon_fcmla")]
29532953
)
29542954

2955+
;; The complex mul operations always need to expand to two instructions.
2956+
;; The first operation does half the computation and the second does the
2957+
;; remainder. Because of this, expand early.
2958+
;; Expander for the VDF modes, enabled for TARGET_COMPLEX on
;; little-endian only.  Steps performed by the preparation code:
;;   1. res1 is a fresh pseudo and tmp is a zero vector forced into a
;;      register.
;;   2. The first VCMLA (rotation <rotsplit1>) accumulates onto tmp and
;;      writes res1.
;;   3. The second VCMLA (rotation <rotsplit2>) accumulates onto res1
;;      and writes operand 0.
;; Both calls pass the multiplicands as (operands[2], operands[1]).
;; DONE means the RTL template above is never emitted itself.
(define_expand "cmul<conj_op><mode>3"
2959+
[(set (match_operand:VDF 0 "register_operand")
2960+
(unspec:VDF [(match_operand:VDF 1 "register_operand")
2961+
(match_operand:VDF 2 "register_operand")]
2962+
VCMUL_OP))]
2963+
"TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
2964+
{
2965+
rtx res1 = gen_reg_rtx (<MODE>mode);
2966+
rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
2967+
emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
2968+
operands[2], operands[1]));
2969+
emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
2970+
operands[2], operands[1]));
2971+
DONE;
2972+
})
2973+
29552974

29562975
;; These instructions map to the __builtins for the Dot Product operations.
29572976
(define_insn "neon_<sup>dot<vsi2qi>"

gcc/config/arm/unspecs.md

+3
Original file line numberDiff line numberDiff line change
@@ -510,10 +510,13 @@
510510
UNSPEC_VCMLA90
511511
UNSPEC_VCMLA180
512512
UNSPEC_VCMLA270
513+
UNSPEC_VCMLA_CONJ
514+
UNSPEC_VCMLA180_CONJ
513515
UNSPEC_VCMUL
514516
UNSPEC_VCMUL90
515517
UNSPEC_VCMUL180
516518
UNSPEC_VCMUL270
519+
UNSPEC_VCMUL_CONJ
517520
UNSPEC_MATMUL_S
518521
UNSPEC_MATMUL_U
519522
UNSPEC_MATMUL_US

gcc/config/arm/vec-common.md

+57
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,63 @@
215215
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
216216
)
217217

218+
;; The complex mul operations always need to expand to two instructions.
219+
;; The first operation does half the computation and the second does the
220+
;; remainder. Because of this, expand early.
221+
;; Expander for the VQ_HSF modes, enabled for TARGET_COMPLEX or for MVE
;; with floating point, on little-endian only.  The first
;; arm_vcmla<rotsplit1> writes res1 from a zero accumulator:
;;   - with TARGET_COMPLEX the zero vector is first forced into a
;;     register;
;;   - otherwise CONST0_RTX is passed directly (presumably so that the
;;     MVE insn can match its zero alternative and print vcmul -- confirm
;;     against mve_vcmlaq in mve.md).
;; The second arm_vcmla<rotsplit2> accumulates onto res1 and writes
;; operand 0.  Both calls pass the multiplicands as
;; (operands[2], operands[1]).
(define_expand "cmul<conj_op><mode>3"
222+
[(set (match_operand:VQ_HSF 0 "register_operand")
223+
(unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
224+
(match_operand:VQ_HSF 2 "register_operand")]
225+
VCMUL_OP))]
226+
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
227+
&& !BYTES_BIG_ENDIAN"
228+
{
229+
rtx res1 = gen_reg_rtx (<MODE>mode);
230+
if (TARGET_COMPLEX)
231+
{
232+
rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
233+
emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
234+
operands[2], operands[1]));
235+
}
236+
else
237+
emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
238+
operands[2], operands[1]));
239+
240+
emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
241+
operands[2], operands[1]));
242+
DONE;
243+
})
244+
245+
;; Expander for a single complex multiply-accumulate with rotation
;; <rot>, over the VF modes and the VCMLA unspec iterator:
;;   operand 0 = operand 1 + unspec VCMLA (operand 2, operand 3).
;; There is no preparation code, so the RTL template is emitted as
;; written.  It provides the gen_arm_vcmla<rot><mode> functions called
;; by the cmul and cml expanders in this file.  Enabled for
;; TARGET_COMPLEX, or for MVE with floating point when
;; ARM_HAVE_<MODE>_ARITH holds; little-endian only.
(define_expand "arm_vcmla<rot><mode>"
246+
[(set (match_operand:VF 0 "register_operand")
247+
(plus:VF (match_operand:VF 1 "register_operand")
248+
(unspec:VF [(match_operand:VF 2 "register_operand")
249+
(match_operand:VF 3 "register_operand")]
250+
VCMLA)))]
251+
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
252+
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
253+
)
254+
255+
;; The complex mla/mls operations always need to expand to two instructions.
256+
;; The first operation does half the computation and the second does the
257+
;; remainder. Because of this, expand early.
258+
;; Expander over the VF modes and the VCMLA_OP iterator.  Operand 1 is
;; the accumulator; operands 2 and 3 are the multiplicands.
;;   1. tmp is a fresh pseudo.
;;   2. arm_vcmla<rotsplit1> accumulates onto operand 1 and writes tmp.
;;   3. arm_vcmla<rotsplit2> accumulates onto tmp and writes operand 0.
;; Both calls pass the multiplicands as (operands[3], operands[2]).
;; Enabled under the same condition as arm_vcmla<rot><mode>.
(define_expand "cml<fcmac1><conj_op><mode>4"
259+
[(set (match_operand:VF 0 "register_operand")
260+
(plus:VF (match_operand:VF 1 "register_operand")
261+
(unspec:VF [(match_operand:VF 2 "register_operand")
262+
(match_operand:VF 3 "register_operand")]
263+
VCMLA_OP)))]
264+
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
265+
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
266+
{
267+
rtx tmp = gen_reg_rtx (<MODE>mode);
268+
emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
269+
operands[3], operands[2]));
270+
emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
271+
operands[3], operands[2]));
272+
DONE;
273+
})
274+
218275
(define_expand "movmisalign<mode>"
219276
[(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
220277
(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]

0 commit comments

Comments
 (0)