Skip to content

Commit bcb4a74

Browse files
committed
Implement Arm64 Stack Probe in src/coreclr/jit/codegenarmarch.cpp src/coreclr/jit/lclvars.cpp src/coreclr/jit/target.h
1 parent 138faab commit bcb4a74

File tree

3 files changed

+87
-99
lines changed

3 files changed

+87
-99
lines changed

src/coreclr/jit/codegenarmarch.cpp

Lines changed: 78 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -3821,77 +3821,7 @@ void CodeGen::genPushCalleeSavedRegisters()
38213821

38223822
int totalFrameSize = genTotalFrameSize();
38233823

3824-
bool useStackProbeHelper = false;
3825-
const int pageSize = (int)compiler->eeGetPageSize();
3826-
3827-
const int currentSpToFinalSp = compiler->compLclFrameSize;
3828-
3829-
if (currentSpToFinalSp < compiler->getVeryLargeFrameSize())
3830-
{
3831-
const regNumber tempReg = REG_SCRATCH;
3832-
bool tempRegWasModified = false;
3833-
3834-
constexpr int ldrLargestPositiveImmByteOffset = 0x8000;
3835-
const bool useLdrUnsignedImmediate = (pageSize < ldrLargestPositiveImmByteOffset / 2);
3836-
3837-
int currentSpToLastProbedLoc = 0;
3838-
3839-
if (useLdrUnsignedImmediate)
3840-
{
3841-
for (int currentSpToTempReg = 0;
3842-
currentSpToFinalSp + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES - currentSpToLastProbedLoc > pageSize;)
3843-
{
3844-
const int currentSpToProbeLoc = min(currentSpToLastProbedLoc + pageSize, currentSpToFinalSp);
3845-
3846-
if (currentSpToProbeLoc > currentSpToTempReg)
3847-
{
3848-
if (currentSpToFinalSp + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES - currentSpToProbeLoc > pageSize)
3849-
{
3850-
// At least one more probe besides the one at [sp, #currentSpToProbeLoc] is needed,
3851-
// so it is worthwhile to advance tempReg and emit two or more ldr xzr, [tempReg, #imm].
3852-
currentSpToTempReg = currentSpToTempReg + ldrLargestPositiveImmByteOffset;
3853-
GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, tempReg, REG_SPBASE, currentSpToTempReg);
3854-
tempRegWasModified = true;
3855-
}
3856-
else
3857-
{
3858-
break;
3859-
}
3860-
}
3861-
3862-
const int probeLocToTempReg = currentSpToTempReg - currentSpToProbeLoc;
3863-
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_ZR, tempReg, probeLocToTempReg);
3864-
currentSpToLastProbedLoc = currentSpToProbeLoc;
3865-
}
3866-
}
3867-
3868-
while (currentSpToFinalSp + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES - currentSpToLastProbedLoc > pageSize)
3869-
{
3870-
const int currentSpToProbeLoc = min(currentSpToLastProbedLoc + pageSize, currentSpToFinalSp);
3871-
3872-
// Emit mov tempReg, #imm followed by ldr wzr, [sp, tempReg].
3873-
genSetRegToIcon(tempReg, -currentSpToProbeLoc, TYP_I_IMPL);
3874-
tempRegWasModified = true;
3875-
GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, tempReg);
3876-
currentSpToLastProbedLoc = currentSpToProbeLoc;
3877-
}
3878-
3879-
if (tempRegWasModified)
3880-
{
3881-
regSet.verifyRegUsed(tempReg);
3882-
3883-
if (initReg == tempReg)
3884-
{
3885-
*pInitRegZeroed = false;
3886-
}
3887-
}
3888-
3889-
compiler->unwindPadding();
3890-
}
3891-
else
3892-
{
3893-
useStackProbeHelper = true;
3894-
}
3824+
const int probePageSize = (int)compiler->eeGetPageSize();
38953825

38963826
int offset; // This will be the starting place for saving the callee-saved registers, in increasing order.
38973827

@@ -4177,6 +4107,51 @@ void CodeGen::genPushCalleeSavedRegisters()
41774107
// If we do establish the frame pointer, what is the amount we add to SP to do so?
41784108
unsigned offsetSpToSavedFp = 0;
41794109

4110+
// Emits an unrolled sequence of page-sized sp decrements, each followed by a probing load.
//
// Parameters:
//   currentSpToFpLrLoc       - distance from the current sp to the location where the fp,lr
//                              register pair will be written.
//   lastProbedLocToCurrentSp - offset that is added to the updated sp to form the address of
//                              each probing load.
//
// Returns the remaining distance between the updated sp and the fp,lr location.
auto emitUnrolledStackProbeLoop = [this, probePageSize, initReg, pInitRegZeroed](int currentSpToFpLrLoc,
4111+
int lastProbedLocToCurrentSp) {
4112+
// We cannot call a stack probe helper before storing the lr register on the stack
4113+
// since the call would trash that register.
4114+
// Instead, for relatively small frames (smaller than STACK_PROBE_HELPER_FRAME_SIZE_PAGES pages)
4115+
// the JIT emits an unrolled stack probing loop.
4116+
// "stp fp, lr, [sp, #fpLrLoc]" would also count as a probe, hence we use and maintain
4117+
// the value of currentSpToFpLrLoc in the algorithm below.
4118+
4119+
// Larger distances are not expected to be handled by this unrolled sequence.
assert(currentSpToFpLrLoc < STACK_PROBE_HELPER_FRAME_SIZE_PAGES * probePageSize);
4120+
4121+
// Generate the following code
4122+
//
4123+
// sub sp, sp, #probePageSize
4124+
// ldr wzr, [sp,#probeLocToCurrentSp]
4125+
// ...
4126+
// sub sp, sp, #probePageSize
4127+
// ldr wzr, [sp,#probeLocToCurrentSp]
4128+
//
4129+
// until sp is no farther than probePageSize from the location
4130+
// where the fp,lr register pair will be written.
4131+
4132+
// Track the last probed location relative to the fp,lr location, which stays fixed while sp moves.
int lastProbedLocToFpLrLoc = lastProbedLocToCurrentSp + currentSpToFpLrLoc;
4133+
4134+
while (currentSpToFpLrLoc > probePageSize)
4135+
{
4136+
// The next probe is exactly one page below the previous one.
const int probeLocToFpLrLoc = lastProbedLocToFpLrLoc - probePageSize;
4137+
4138+
// Move sp down by one page and report the adjustment to the unwind data.
genStackPointerAdjustment(-probePageSize, initReg, pInitRegZeroed, /* reportUnwindData */ true);
4139+
currentSpToFpLrLoc -= probePageSize;
4140+
4141+
// Since sp and the probe location both moved down by one page, this offset is the same
// on every iteration and equals the initial value of lastProbedLocToCurrentSp.
const int probeLocToCurrentSp = probeLocToFpLrLoc - currentSpToFpLrLoc;
4142+
4143+
// 4-byte load into the zero register: the loaded value is discarded, only the access matters.
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, probeLocToCurrentSp);
4144+
// NOTE(review): presumably records a nop unwind code for the load above - confirm.
compiler->unwindNop();
4145+
4146+
lastProbedLocToFpLrLoc = probeLocToFpLrLoc;
4147+
}
4148+
4149+
// The loop doesn't have to stop at the location where the fp,lr register pair will be written.
4150+
// Therefore, we need to return the distance between the updated sp value and that location to
4151+
// the caller of this lambda.
4152+
return currentSpToFpLrLoc;
4153+
};
4154+
41804155
if (frameType == 1)
41814156
{
41824157
assert(!genSaveFpLrWithAllCalleeSavedRegisters);
@@ -4197,6 +4172,13 @@ void CodeGen::genPushCalleeSavedRegisters()
41974172
assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component --
41984173
// totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned.
41994174

4175+
int lastProbedLocToCurrentSp = STACK_PROBE_BOUNDARY_THRESHOLD_BYTES;
4176+
4177+
if (compiler->info.compIsVarArgs || ((maskSaveRegsInt | maskSaveRegsFloat) != 0))
4178+
{
4179+
lastProbedLocToCurrentSp = 0;
4180+
}
4181+
42004182
if (compiler->lvaOutgoingArgSpaceSize > 504)
42014183
{
42024184
// We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big.
@@ -4209,7 +4191,11 @@ void CodeGen::genPushCalleeSavedRegisters()
42094191

42104192
JITDUMP(" spAdjustment2=%d\n", spAdjustment2);
42114193

4212-
genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed);
4194+
int currentSpToFpLrLoc = spAdjustment2;
4195+
currentSpToFpLrLoc = emitUnrolledStackProbeLoop(currentSpToFpLrLoc, lastProbedLocToCurrentSp);
4196+
4197+
genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -currentSpToFpLrLoc, false, initReg,
4198+
pInitRegZeroed);
42134199
offset += spAdjustment2;
42144200

42154201
// Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub"
@@ -4227,17 +4213,28 @@ void CodeGen::genPushCalleeSavedRegisters()
42274213

42284214
JITDUMP(" spAdjustment3=%d\n", spAdjustment3);
42294215

4230-
// We've already established the frame pointer, so no need to report the stack pointer change to unwind
4231-
// info.
4232-
genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false);
4216+
if (spAdjustment3 >= probePageSize)
4217+
{
4218+
genEmitStackProbeHelperCall(spAdjustment3, initReg, pInitRegZeroed);
4219+
}
4220+
else
4221+
{
4222+
// We've already established the frame pointer, so no need to report the stack pointer change to unwind
4223+
// info.
4224+
genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false);
4225+
}
42334226
offset += spAdjustment3;
42344227
}
42354228
else
42364229
{
4237-
genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -remainingFrameSz, false, initReg,
4230+
int currentSpToFpLrLoc = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize;
4231+
currentSpToFpLrLoc = emitUnrolledStackProbeLoop(currentSpToFpLrLoc, lastProbedLocToCurrentSp);
4232+
4233+
const int currentSpToFinalSp = currentSpToFpLrLoc + compiler->lvaOutgoingArgSpaceSize;
4234+
genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -currentSpToFinalSp, false, initReg,
42384235
pInitRegZeroed);
4239-
offset += remainingFrameSz;
42404236

4237+
offset += remainingFrameSz;
42414238
offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize;
42424239
}
42434240
}
@@ -4266,26 +4263,14 @@ void CodeGen::genPushCalleeSavedRegisters()
42664263

42674264
JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz);
42684265

4269-
// We've already established the frame pointer, so no need to report the unwind info at this point.
4270-
const bool reportUnwindData = false;
4271-
4272-
if (useStackProbeHelper)
4266+
if (remainingFrameSz >= probePageSize)
42734267
{
4274-
genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, remainingFrameSz,
4275-
REG_STACK_PROBE_HELPER_ARG, reportUnwindData);
4276-
regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG);
4277-
genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN, REG_STACK_PROBE_HELPER_CALL_TARGET);
4278-
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG);
4279-
4280-
if ((genRegMask(initReg) & (RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET |
4281-
RBM_STACK_PROBE_HELPER_TRASH)) != RBM_NONE)
4282-
{
4283-
*pInitRegZeroed = false;
4284-
}
4268+
genEmitStackProbeHelperCall(remainingFrameSz, initReg, pInitRegZeroed);
42854269
}
42864270
else
42874271
{
4288-
genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, reportUnwindData);
4272+
// We've already established the frame pointer, so no need to report the unwind info at this point.
4273+
genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, /* reportUnwindData */ false);
42894274
}
42904275

42914276
offset += remainingFrameSz;

src/coreclr/jit/lclvars.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5998,7 +5998,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
59985998
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
59995999
}
60006000

6001-
if (compLclFrameSize >= getVeryLargeFrameSize())
6001+
const unsigned probePageSize = (unsigned)eeGetPageSize();
6002+
6003+
if (compLclFrameSize >= STACK_PROBE_HELPER_FRAME_SIZE_PAGES * probePageSize)
60026004
{
60036005
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true);
60046006
}

src/coreclr/jit/target.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,11 +1552,12 @@ typedef unsigned char regNumberSmall;
15521552
// For arm64, this is the maximum prolog establishment pre-indexed (that is SP pre-decrement) offset.
15531553
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 512
15541554

1555-
#define REG_STACK_PROBE_HELPER_ARG REG_R9
1556-
#define RBM_STACK_PROBE_HELPER_ARG RBM_R9
1557-
#define REG_STACK_PROBE_HELPER_CALL_TARGET REG_IP0
1558-
#define RBM_STACK_PROBE_HELPER_CALL_TARGET RBM_IP0
1559-
#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE
1555+
#define STACK_PROBE_HELPER_FRAME_SIZE_PAGES 4
1556+
#define REG_STACK_PROBE_HELPER_ARG REG_R9
1557+
#define RBM_STACK_PROBE_HELPER_ARG RBM_R9
1558+
#define REG_STACK_PROBE_HELPER_CALL_TARGET REG_IP0
1559+
#define RBM_STACK_PROBE_HELPER_CALL_TARGET RBM_IP0
1560+
#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE
15601561

15611562
// Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. "MLA (by element)")
15621563
// have an encoding that restricts which registers can be used for the indexed element when the element size is H (i.e. 2 bytes).

0 commit comments

Comments
 (0)