Skip to content

Commit df9385b

Browse files
committed
JIT ARM64-SVE: Add AddAcross
1 parent 7745b5e commit df9385b

File tree

11 files changed

+731
-110
lines changed

11 files changed

+731
-110
lines changed

src/coreclr/jit/codegenarm64test.cpp

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5314,11 +5314,11 @@ void CodeGen::genArm64EmitterUnitTestsSve()
53145314
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED
53155315

53165316
// IF_SVE_AI_3A
5317-
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_1BYTE, REG_V1, REG_P4, REG_V2,
5317+
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V1, REG_P4, REG_V2,
53185318
INS_OPTS_SCALABLE_B); // SADDV <Dd>, <Pg>, <Zn>.<T>
5319-
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_2BYTE, REG_V2, REG_P5, REG_V3,
5319+
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V2, REG_P5, REG_V3,
53205320
INS_OPTS_SCALABLE_H); // SADDV <Dd>, <Pg>, <Zn>.<T>
5321-
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_4BYTE, REG_V3, REG_P6, REG_V4,
5321+
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_SCALABLE, REG_V3, REG_P6, REG_V4,
53225322
INS_OPTS_SCALABLE_S); // UADDV <Dd>, <Pg>, <Zn>.<T>
53235323

53245324
// IF_SVE_AJ_3A
@@ -6768,15 +6768,15 @@ void CodeGen::genArm64EmitterUnitTestsSve()
67686768
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED
67696769

67706770
// IF_SVE_HE_3A
6771-
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_2BYTE, REG_V21, REG_P7, REG_V7,
6771+
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_SCALABLE, REG_V21, REG_P7, REG_V7,
67726772
INS_OPTS_SCALABLE_H); // FADDV <V><d>, <Pg>, <Zn>.<T>
6773-
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_2BYTE, REG_V22, REG_P6, REG_V6,
6773+
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_SCALABLE, REG_V22, REG_P6, REG_V6,
67746774
INS_OPTS_SCALABLE_H); // FMAXNMV <V><d>, <Pg>, <Zn>.<T>
6775-
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_4BYTE, REG_V23, REG_P5, REG_V5,
6775+
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_SCALABLE, REG_V23, REG_P5, REG_V5,
67766776
INS_OPTS_SCALABLE_S); // FMAXV <V><d>, <Pg>, <Zn>.<T>
6777-
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_8BYTE, REG_V24, REG_P4, REG_V4,
6777+
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_SCALABLE, REG_V24, REG_P4, REG_V4,
67786778
INS_OPTS_SCALABLE_D); // FMINNMV <V><d>, <Pg>, <Zn>.<T>
6779-
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_4BYTE, REG_V25, REG_P3, REG_V3,
6779+
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_SCALABLE, REG_V25, REG_P3, REG_V3,
67806780
INS_OPTS_SCALABLE_S); // FMINV <V><d>, <Pg>, <Zn>.<T>
67816781

67826782
// IF_SVE_HQ_3A

src/coreclr/jit/emitarm64sve.cpp

Lines changed: 23 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3060,7 +3060,6 @@ void emitter::emitInsSve_R_R_R(instruction ins,
30603060
break;
30613061

30623062
case INS_sve_saddv:
3063-
case INS_sve_uaddv:
30643063
assert(isFloatReg(reg1));
30653064
assert(isLowPredicateRegister(reg2));
30663065
assert(isVectorRegister(reg3));
@@ -3069,6 +3068,15 @@ void emitter::emitInsSve_R_R_R(instruction ins,
30693068
fmt = IF_SVE_AI_3A;
30703069
break;
30713070

3071+
case INS_sve_uaddv:
3072+
assert(isFloatReg(reg1));
3073+
assert(isLowPredicateRegister(reg2));
3074+
assert(isVectorRegister(reg3));
3075+
assert(insOptsScalableStandard(opt));
3076+
assert(insScalableOptsNone(sopt));
3077+
fmt = IF_SVE_AI_3A;
3078+
break;
3079+
30723080
case INS_sve_addqv:
30733081
unreached(); // TODO-SVE: Not yet supported.
30743082
assert(isVectorRegister(reg1));
@@ -4059,7 +4067,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
40594067
assert(isLowPredicateRegister(reg2));
40604068
assert(isVectorRegister(reg3));
40614069
assert(insOptsScalableFloat(opt));
4062-
assert(isValidVectorElemsizeSveFloat(size));
4070+
assert(isScalableVectorSize(size));
40634071
assert(insScalableOptsNone(sopt));
40644072
fmt = IF_SVE_HE_3A;
40654073
break;
@@ -4069,7 +4077,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
40694077
assert(isLowPredicateRegister(reg2));
40704078
assert(isVectorRegister(reg3));
40714079
assert(insOptsScalableFloat(opt));
4072-
assert(isValidVectorElemsizeSveFloat(size));
4080+
assert(isScalableVectorSize(size));
40734081
assert(insScalableOptsNone(sopt));
40744082
fmt = IF_SVE_HJ_3A;
40754083
break;
@@ -12618,7 +12626,7 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)
1261812626
assert(isVectorRegister(id->idReg1())); // ddddd
1261912627
assert(isLowPredicateRegister(id->idReg2())); // ggg
1262012628
assert(isVectorRegister(id->idReg3())); // mmmmm
12621-
assert(isValidVectorElemsizeSveFloat(id->idOpSize()));
12629+
assert(isScalableVectorSize(id->idOpSize()));
1262212630
break;
1262312631

1262412632
// Scalable to general register.
@@ -13211,11 +13219,20 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)
1321113219

1321213220
// Scalable, widening to scalar SIMD.
1321313221
case IF_SVE_AI_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (predicated)
13214-
assert(insOptsScalableWide(id->idInsOpt())); // xx
13222+
switch (id->idIns())
13223+
{
13224+
case INS_sve_saddv:
13225+
assert(insOptsScalableWide(id->idInsOpt())); // xx
13226+
break;
13227+
13228+
default:
13229+
assert(insOptsScalableStandard(id->idInsOpt())); // xx
13230+
break;
13231+
}
1321513232
assert(isVectorRegister(id->idReg1())); // ddddd
1321613233
assert(isLowPredicateRegister(id->idReg2())); // ggg
1321713234
assert(isVectorRegister(id->idReg3())); // mmmmm
13218-
assert(isValidVectorElemsizeWidening(id->idOpSize()));
13235+
assert(isScalableVectorSize(id->idOpSize()));
1321913236
break;
1322013237

1322113238
// Scalable, possibly FP.

src/coreclr/jit/hwintrinsic.h

Lines changed: 73 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -70,175 +70,176 @@ enum HWIntrinsicCategory : uint8_t
7070
#else
7171
#error Unsupported platform
7272
#endif
73+
7374
enum HWIntrinsicFlag : unsigned int
7475
{
7576
HW_Flag_NoFlag = 0,
7677

7778
// Commutative
7879
// - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained
79-
HW_Flag_Commutative = 0x1,
80+
HW_Flag_Commutative = (unsigned int)(1 << 1),
8081

8182
// NoCodeGen
8283
// - should be transformed in the compiler front-end, cannot reach CodeGen
83-
HW_Flag_NoCodeGen = 0x2,
84+
HW_Flag_NoCodeGen = (1 << 2),
8485

8586
// Multi-instruction
8687
// - that one intrinsic can generate multiple instructions
87-
HW_Flag_MultiIns = 0x4,
88+
HW_Flag_MultiIns = (1 << 3),
8889

8990
// Select base type using the first argument type
90-
HW_Flag_BaseTypeFromFirstArg = 0x8,
91+
HW_Flag_BaseTypeFromFirstArg = (1 << 4),
9192

9293
// Select base type using the second argument type
93-
HW_Flag_BaseTypeFromSecondArg = 0x10,
94+
HW_Flag_BaseTypeFromSecondArg = (1 << 5),
9495

9596
// Indicates compFloatingPointUsed does not need to be set.
96-
HW_Flag_NoFloatingPointUsed = 0x20,
97+
HW_Flag_NoFloatingPointUsed = (1 << 6),
9798

9899
// NoJmpTable IMM
99100
// the imm intrinsic does not need jumptable fallback when it gets non-const argument
100-
HW_Flag_NoJmpTableIMM = 0x40,
101+
HW_Flag_NoJmpTableIMM = (1 << 7),
101102

102103
// Special codegen
103104
// the intrinsics need special rules in CodeGen,
104105
// but may be table-driven in the front-end
105-
HW_Flag_SpecialCodeGen = 0x80,
106+
HW_Flag_SpecialCodeGen = (1 << 8),
106107

107108
// Special import
108109
// the intrinsics need special rules in importer,
109110
// but may be table-driven in the back-end
110-
HW_Flag_SpecialImport = 0x100,
111+
HW_Flag_SpecialImport = (1 << 9),
111112

112113
// The intrinsic returns result in multiple registers.
113-
HW_Flag_MultiReg = 0x200,
114+
HW_Flag_MultiReg = (1 << 10),
115+
116+
// The intrinsic has some barrier special side effect that should be tracked
117+
HW_Flag_SpecialSideEffect_Barrier = (1 << 11),
118+
119+
// The intrinsic has some other special side effect that should be tracked
120+
HW_Flag_SpecialSideEffect_Other = (1 << 12),
121+
122+
HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),
114123

115-
// The below is for defining platform-specific flags
124+
// MaybeNoJmpTable IMM
125+
// the imm intrinsic may not need jumptable fallback when it gets non-const argument
126+
HW_Flag_MaybeNoJmpTableIMM = (1 << 13),
127+
128+
HW_Flag_CanBenefitFromConstantProp = (1 << 14),
129+
130+
// Used as a base for shifting the platform specific flags.
131+
HW_Flag_PlatformBase = 14,
132+
#define HW_TARGET_FLAG(id) (unsigned int)(1 << (id + HW_Flag_PlatformBase))
133+
134+
// Platform-specific flags
116135
#if defined(TARGET_XARCH)
117136
// Full range IMM intrinsic
118137
// - the immediate value is valid on the full range of imm8 (0-255)
119-
HW_Flag_FullRangeIMM = 0x400,
138+
HW_Flag_FullRangeIMM = HW_TARGET_FLAG(1),
120139

121140
// Maybe IMM
122141
// the intrinsic has either imm or Vector overloads
123-
HW_Flag_MaybeIMM = 0x800,
142+
HW_Flag_MaybeIMM = HW_TARGET_FLAG(2),
124143

125144
// Copy Upper bits
126145
// some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand
127-
HW_Flag_CopyUpperBits = 0x1000,
146+
HW_Flag_CopyUpperBits = HW_TARGET_FLAG(3),
128147

129148
// Maybe Memory Load/Store
130149
// - some intrinsics may have pointer overloads but without HW_Category_MemoryLoad/HW_Category_MemoryStore
131-
HW_Flag_MaybeMemoryLoad = 0x2000,
132-
HW_Flag_MaybeMemoryStore = 0x4000,
150+
HW_Flag_MaybeMemoryLoad = HW_TARGET_FLAG(4),
151+
HW_Flag_MaybeMemoryStore = HW_TARGET_FLAG(5),
133152

134153
// No Read/Modify/Write Semantics
135154
// the intrinsic doesn't have read/modify/write semantics in two/three-operand form.
136-
HW_Flag_NoRMWSemantics = 0x8000,
155+
HW_Flag_NoRMWSemantics = HW_TARGET_FLAG(6),
137156

138157
// NoContainment
139158
// the intrinsic cannot be handled by containment,
140159
// all the intrinsic that have explicit memory load/store semantics should have this flag
141-
HW_Flag_NoContainment = 0x10000,
160+
HW_Flag_NoContainment = HW_TARGET_FLAG(7),
142161

143162
// Returns Per-Element Mask
144163
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
145164
// this output can be used as a per-element mask
146-
HW_Flag_ReturnsPerElementMask = 0x20000,
165+
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(8),
147166

148167
// AvxOnlyCompatible
149168
// the intrinsic can be used on hardware with AVX but not AVX2 support
150-
HW_Flag_AvxOnlyCompatible = 0x40000,
169+
HW_Flag_AvxOnlyCompatible = HW_TARGET_FLAG(9),
151170

152171
// MaybeCommutative
153172
// - if a binary-op intrinsic is maybe commutative (e.g., Max or Min for float/double), its op1 can possibly be
154173
// contained
155-
HW_Flag_MaybeCommutative = 0x80000,
174+
HW_Flag_MaybeCommutative = HW_TARGET_FLAG(10),
156175

157176
// The intrinsic has no EVEX compatible form
158-
HW_Flag_NoEvexSemantics = 0x100000,
177+
HW_Flag_NoEvexSemantics = HW_TARGET_FLAG(11),
178+
179+
// The intrinsic is an RMW intrinsic
180+
HW_Flag_RmwIntrinsic = HW_TARGET_FLAG(12),
181+
182+
// The intrinsic is a FusedMultiplyAdd intrinsic
183+
HW_Flag_FmaIntrinsic = HW_TARGET_FLAG(13),
184+
185+
// The intrinsic is a PermuteVar2x intrinsic
186+
HW_Flag_PermuteVar2x = HW_TARGET_FLAG(14),
187+
188+
// The intrinsic is an embedded broadcast compatible intrinsic
189+
HW_Flag_EmbBroadcastCompatible = HW_TARGET_FLAG(15),
190+
191+
// The intrinsic is an embedded rounding compatible intrinsic
192+
HW_Flag_EmbRoundingCompatible = HW_TARGET_FLAG(16),
193+
194+
// The intrinsic is an embedded masking incompatible intrinsic
195+
HW_Flag_EmbMaskingIncompatible = HW_TARGET_FLAG(17),
159196

160197
#elif defined(TARGET_ARM64)
161198
// The intrinsic has an immediate operand
162199
// - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant
163-
HW_Flag_HasImmediateOperand = 0x400,
200+
HW_Flag_HasImmediateOperand = HW_TARGET_FLAG(1),
164201

165202
// The intrinsic has read/modify/write semantics in multiple-operands form.
166-
HW_Flag_HasRMWSemantics = 0x800,
203+
HW_Flag_HasRMWSemantics = HW_TARGET_FLAG(2),
167204

168205
// The intrinsic operates on the lower part of a SIMD register
169206
// - the upper part of the source registers are ignored
170207
// - the upper part of the destination register is zeroed
171-
HW_Flag_SIMDScalar = 0x1000,
208+
HW_Flag_SIMDScalar = HW_TARGET_FLAG(3),
172209

173210
// The intrinsic supports some sort of containment analysis
174-
HW_Flag_SupportsContainment = 0x2000,
211+
HW_Flag_SupportsContainment = HW_TARGET_FLAG(4),
175212

176213
// The intrinsic needs consecutive registers
177-
HW_Flag_NeedsConsecutiveRegisters = 0x4000,
214+
HW_Flag_NeedsConsecutiveRegisters = HW_TARGET_FLAG(5),
178215

179216
// The intrinsic uses scalable registers
180-
HW_Flag_Scalable = 0x8000,
217+
HW_Flag_Scalable = HW_TARGET_FLAG(6),
181218

182219
// Returns Per-Element Mask
183220
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
184221
// this output can be used as a per-element mask
185-
HW_Flag_ReturnsPerElementMask = 0x10000,
222+
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(7),
186223

187224
// The intrinsic uses a mask in arg1 to select elements present in the result
188-
HW_Flag_ExplicitMaskedOperation = 0x20000,
225+
HW_Flag_ExplicitMaskedOperation = HW_TARGET_FLAG(8),
189226

190227
// The intrinsic uses a mask in arg1 to select elements present in the result, and must use a low register.
191-
HW_Flag_LowMaskedOperation = 0x40000,
228+
HW_Flag_LowMaskedOperation = HW_TARGET_FLAG(9),
192229

193230
// The intrinsic can optionally use a mask in arg1 to select elements present in the result, which is not present in
194231
// the API call
195-
HW_Flag_OptionalEmbeddedMaskedOperation = 0x80000,
232+
HW_Flag_OptionalEmbeddedMaskedOperation = HW_TARGET_FLAG(10),
196233

197234
// The intrinsic uses a mask in arg1 to select elements present in the result, which is not present in the API call
198-
HW_Flag_EmbeddedMaskedOperation = 0x100000,
235+
HW_Flag_EmbeddedMaskedOperation = HW_TARGET_FLAG(11),
236+
237+
// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
238+
HW_Flag_HasEnumOperand = HW_TARGET_FLAG(12),
199239

200240
#else
201241
#error Unsupported platform
202242
#endif
203-
204-
// The intrinsic has some barrier special side effect that should be tracked
205-
HW_Flag_SpecialSideEffect_Barrier = 0x200000,
206-
207-
// The intrinsic has some other special side effect that should be tracked
208-
HW_Flag_SpecialSideEffect_Other = 0x400000,
209-
210-
HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),
211-
212-
// MaybeNoJmpTable IMM
213-
// the imm intrinsic may not need jumptable fallback when it gets non-const argument
214-
HW_Flag_MaybeNoJmpTableIMM = 0x800000,
215-
216-
#if defined(TARGET_XARCH)
217-
// The intrinsic is an RMW intrinsic
218-
HW_Flag_RmwIntrinsic = 0x1000000,
219-
220-
// The intrinsic is a FusedMultiplyAdd intrinsic
221-
HW_Flag_FmaIntrinsic = 0x2000000,
222-
223-
// The intrinsic is a PermuteVar2x intrinsic
224-
HW_Flag_PermuteVar2x = 0x4000000,
225-
226-
// The intrinsic is an embedded broadcast compatible intrinsic
227-
HW_Flag_EmbBroadcastCompatible = 0x8000000,
228-
229-
// The intrinsic is an embedded rounding compatible intrinsic
230-
HW_Flag_EmbRoundingCompatible = 0x10000000,
231-
232-
// The intrinsic is an embedded masking incompatible intrinsic
233-
HW_Flag_EmbMaskingIncompatible = 0x20000000,
234-
#elif defined(TARGET_ARM64)
235-
236-
// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
237-
HW_Flag_HasEnumOperand = 0x1000000,
238-
239-
#endif // TARGET_XARCH
240-
241-
HW_Flag_CanBenefitFromConstantProp = 0x80000000,
242243
};
243244

244245
#if defined(TARGET_XARCH)

src/coreclr/jit/hwintrinsiclistarm64sve.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
// Sve
2020
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
2121
HARDWARE_INTRINSIC(Sve, Add, -1, -1, false, {INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_fadd, INS_sve_fadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
22+
HARDWARE_INTRINSIC(Sve, AddAcross, -1, 1, true, {INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_faddv, INS_sve_faddv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation)
2223
HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, true, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment)
2324
HARDWARE_INTRINSIC(Sve, Count16BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cnth, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)
2425
HARDWARE_INTRINSIC(Sve, Count32BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cntw, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)

0 commit comments

Comments
 (0)