Skip to content

Commit 4536b96

Browse files
authored
[NativeAOT/ARM64] Generate frames compatible with Apple compact unwinding (#107766)
* JIT/ARM64: Add the ability to generate frames compatible with the Apple compact unwinding format. For the NativeAOT/ARM64/Apple ABI, do the following:
  - Save callee-saved registers in the opposite order and in pairs.
  - Prefer saving FP/LR at the top of the frame. Heuristics are used to avoid worse code quality outside of the prolog/epilog due to the addressing range limits of the ARM64 instruction set.
  - Add an optimization to lvaFrameAddress that rewrites FP-x references to SP+y when possible. This allows efficient addressing using positive offsets when FP points to the top of the frame. It mimics a similar optimization on ARM32.
* ObjWriter: For Mach-O ARM64, try to convert the DWARF CFI unwinding codes into compact unwinding codes.
* Disable the lvaFrameAddress FP->SP optimization for OSR methods.
1 parent 9bef28b commit 4536b96

File tree

7 files changed

+329
-86
lines changed

7 files changed

+329
-86
lines changed

src/coreclr/jit/codegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,7 @@ class CodeGen final : public CodeGenInterface
659659
virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const;
660660
bool genSaveFpLrWithAllCalleeSavedRegisters;
661661
bool genForceFuncletFrameType5;
662+
bool genReverseAndPairCalleeSavedRegisters;
662663
#endif // TARGET_ARM64
663664

664665
//-------------------------------------------------------------------------

src/coreclr/jit/codegenarm64.cpp

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -845,12 +845,19 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i
845845

846846
for (int i = 0; i < regStack.Height(); ++i)
847847
{
848-
RegPair regPair = regStack.Bottom(i);
848+
RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Top(i) : regStack.Bottom(i);
849849
if (regPair.reg2 != REG_NA)
850850
{
851851
// We can use a STP instruction.
852-
genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
853-
nullptr);
852+
if (genReverseAndPairCalleeSavedRegisters)
853+
{
854+
genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, REG_IP0, nullptr);
855+
}
856+
else
857+
{
858+
genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
859+
nullptr);
860+
}
854861

855862
spOffset += 2 * slotSize;
856863
}
@@ -926,8 +933,9 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
926933

927934
// Save integer registers at higher addresses than floating-point registers.
928935

936+
regMaskTP maskSaveRegsFrame = regsToSaveMask & (RBM_FP | RBM_LR);
929937
regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT;
930-
regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat;
938+
regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat & ~maskSaveRegsFrame;
931939

932940
if (maskSaveRegsFloat != RBM_NONE)
933941
{
@@ -939,6 +947,13 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
939947
if (maskSaveRegsInt != RBM_NONE)
940948
{
941949
genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset);
950+
spDelta = 0;
951+
lowestCalleeSavedOffset += genCountBits(maskSaveRegsInt) * FPSAVE_REGSIZE_BYTES;
952+
}
953+
954+
if (maskSaveRegsFrame != RBM_NONE)
955+
{
956+
genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, nullptr);
942957
// No need to update spDelta, lowestCalleeSavedOffset since they're not used after this.
943958
}
944959
}
@@ -970,13 +985,20 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
970985
stackDelta = spDelta;
971986
}
972987

973-
RegPair regPair = regStack.Top(i);
988+
RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Bottom(i) : regStack.Top(i);
974989
if (regPair.reg2 != REG_NA)
975990
{
976991
spOffset -= 2 * slotSize;
977992

978-
genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, REG_IP1,
979-
nullptr);
993+
if (genReverseAndPairCalleeSavedRegisters)
994+
{
995+
genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, REG_IP1, nullptr);
996+
}
997+
else
998+
{
999+
genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair,
1000+
REG_IP1, nullptr);
1001+
}
9801002
}
9811003
else
9821004
{
@@ -1043,11 +1065,19 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
10431065

10441066
// Save integer registers at higher addresses than floating-point registers.
10451067

1068+
regMaskTP maskRestoreRegsFrame = regsToRestoreMask & (RBM_FP | RBM_LR);
10461069
regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT;
1047-
regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat;
1070+
regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat & ~maskRestoreRegsFrame;
10481071

10491072
// Restore in the opposite order of saving.
10501073

1074+
if (maskRestoreRegsFrame != RBM_NONE)
1075+
{
1076+
int spFrameDelta = (maskRestoreRegsFloat != RBM_NONE || maskRestoreRegsInt != RBM_NONE) ? 0 : spDelta;
1077+
spOffset -= 2 * REGSIZE_BYTES;
1078+
genEpilogRestoreRegPair(REG_FP, REG_LR, spOffset, spFrameDelta, false, REG_IP1, nullptr);
1079+
}
1080+
10511081
if (maskRestoreRegsInt != RBM_NONE)
10521082
{
10531083
int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment?

src/coreclr/jit/codegencommon.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ CodeGen::CodeGen(Compiler* theCompiler)
255255
#ifdef TARGET_ARM64
256256
genSaveFpLrWithAllCalleeSavedRegisters = false;
257257
genForceFuncletFrameType5 = false;
258+
genReverseAndPairCalleeSavedRegisters = false;
258259
#endif // TARGET_ARM64
259260
}
260261

@@ -4827,6 +4828,29 @@ void CodeGen::genFinalizeFrame()
48274828
}
48284829
#endif // TARGET_ARM
48294830

4831+
#ifdef TARGET_ARM64
4832+
if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform)
4833+
{
4834+
JITDUMP("Setting genReverseAndPairCalleeSavedRegisters = true");
4835+
4836+
genReverseAndPairCalleeSavedRegisters = true;
4837+
4838+
// Make sure we push the registers in pairs if possible. If we only allocate a contiguous
4839+
// block of registers this should add at most one integer and at most one floating point
4840+
// register to the list. The stack has to be 16-byte aligned, so in worst case it results
4841+
// in allocating 16 bytes more space on stack if odd number of integer and odd number of
4842+
// FP registers were occupied. Same number of instructions will be generated, just the
4843+
// STR instructions are replaced with STP (store pair).
4844+
regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask();
4845+
regMaskTP maskPairRegs = ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) |
4846+
((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1);
4847+
if (maskPairRegs != RBM_NONE)
4848+
{
4849+
regSet.rsSetRegsModified(maskPairRegs);
4850+
}
4851+
}
4852+
#endif
4853+
48304854
#ifdef DEBUG
48314855
if (verbose)
48324856
{

src/coreclr/jit/compiler.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2808,6 +2808,16 @@ inline
28082808
{
28092809
*pBaseReg = REG_SPBASE;
28102810
}
2811+
#elif defined(TARGET_ARM64)
2812+
if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 && !opts.IsOSR() &&
2813+
lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
2814+
{
2815+
int spVarOffset = varOffset + codeGen->genSPtoFPdelta();
2816+
JITDUMP("lvaFrameAddress optimization for V%02u: [FP-%d] -> [SP+%d]\n", varNum, -varOffset, spVarOffset);
2817+
FPbased = false;
2818+
varOffset = spVarOffset;
2819+
}
2820+
*pFPbased = FPbased;
28112821
#else
28122822
*pFPbased = FPbased;
28132823
#endif

src/coreclr/jit/lclvars.cpp

Lines changed: 85 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5644,7 +5644,9 @@ void Compiler::lvaFixVirtualFrameOffsets()
56445644
#endif
56455645

56465646
// The delta to be added to virtual offset to adjust it relative to frame pointer or SP
5647-
int delta = 0;
5647+
int delta = 0;
5648+
int frameLocalsDelta = 0;
5649+
int frameBoundary = 0;
56485650

56495651
#ifdef TARGET_XARCH
56505652
delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64
@@ -5669,7 +5671,25 @@ void Compiler::lvaFixVirtualFrameOffsets()
56695671
// We set FP to be after LR, FP
56705672
delta += 2 * REGSIZE_BYTES;
56715673
}
5672-
#elif defined(TARGET_AMD64) || defined(TARGET_ARM64)
5674+
#elif defined(TARGET_ARM64)
5675+
else
5676+
{
5677+
// FP is used.
5678+
delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta();
5679+
5680+
// If we placed FP/LR at the bottom of the frame we need to shift all the variables
5681+
// on the new frame to account for it. See lvaAssignVirtualFrameOffsetsToLocals.
5682+
if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
5683+
{
5684+
// We set FP to be after LR, FP
5685+
frameLocalsDelta = 2 * REGSIZE_BYTES;
5686+
frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0;
5687+
if (info.compIsVarArgs)
5688+
frameBoundary -= MAX_REG_ARG * REGSIZE_BYTES;
5689+
}
5690+
JITDUMP("--- delta bump %d for FP frame, %d inside frame for FP/LR relocation\n", delta, frameLocalsDelta);
5691+
}
5692+
#elif defined(TARGET_AMD64)
56735693
else
56745694
{
56755695
// FP is used.
@@ -5737,7 +5757,7 @@ void Compiler::lvaFixVirtualFrameOffsets()
57375757

57385758
#if defined(TARGET_X86)
57395759
// On x86, we set the stack offset for a promoted field
5740-
// to match a struct parameter in lvAssignFrameOffsetsToPromotedStructs.
5760+
// to match a struct parameter in lvaAssignFrameOffsetsToPromotedStructs.
57415761
if ((!varDsc->lvIsParam || parentvarDsc->lvIsParam) && promotionType == PROMOTION_TYPE_DEPENDENT)
57425762
#else
57435763
if (!varDsc->lvIsParam && promotionType == PROMOTION_TYPE_DEPENDENT)
@@ -5757,15 +5777,23 @@ void Compiler::lvaFixVirtualFrameOffsets()
57575777

57585778
if (doAssignStkOffs)
57595779
{
5760-
JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + delta);
5761-
varDsc->SetStackOffset(varDsc->GetStackOffset() + delta);
5780+
int localDelta = delta;
5781+
5782+
if (frameLocalsDelta != 0 && varDsc->GetStackOffset() < frameBoundary)
5783+
{
5784+
localDelta += frameLocalsDelta;
5785+
}
5786+
5787+
JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(),
5788+
varDsc->GetStackOffset() + localDelta);
5789+
varDsc->SetStackOffset(varDsc->GetStackOffset() + localDelta);
57625790

57635791
#if DOUBLE_ALIGN
57645792
if (genDoubleAlign() && !codeGen->isFramePointerUsed())
57655793
{
57665794
if (varDsc->lvFramePointerBased)
57675795
{
5768-
varDsc->SetStackOffset(varDsc->GetStackOffset() - delta);
5796+
varDsc->SetStackOffset(varDsc->GetStackOffset() - localDelta);
57695797

57705798
// We need to re-adjust the offsets of the parameters so they are EBP
57715799
// relative rather than stack/frame pointer relative
@@ -5787,9 +5815,13 @@ void Compiler::lvaFixVirtualFrameOffsets()
57875815
assert(codeGen->regSet.tmpAllFree());
57885816
for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp))
57895817
{
5790-
temp->tdAdjustTempOffs(delta);
5818+
temp->tdAdjustTempOffs(delta + frameLocalsDelta);
57915819
}
57925820

5821+
if (lvaCachedGenericContextArgOffs < frameBoundary)
5822+
{
5823+
lvaCachedGenericContextArgOffs += frameLocalsDelta;
5824+
}
57935825
lvaCachedGenericContextArgOffs += delta;
57945826

57955827
#if FEATURE_FIXED_OUT_ARGS
@@ -6045,30 +6077,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
60456077
codeGen->setFramePointerUsed(codeGen->isFramePointerRequired());
60466078
}
60476079

6048-
#ifdef TARGET_ARM64
6049-
// Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
6050-
// a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
6051-
// need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
6052-
// and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
6053-
// frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
6054-
// (It should be legal to use these frame types for every frame).
6055-
6056-
if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
6057-
{
6058-
// Default configuration
6059-
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
6060-
opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20));
6061-
}
6062-
else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
6063-
{
6064-
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
6065-
}
6066-
else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
6067-
{
6068-
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
6069-
}
6070-
#endif // TARGET_ARM64
6071-
60726080
#ifdef TARGET_XARCH
60736081
// On x86/amd64, the return address has already been pushed by the call instruction in the caller.
60746082
stkOffs -= TARGET_POINTER_SIZE; // return address;
@@ -6117,9 +6125,13 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
61176125
#endif // !TARGET_ARM
61186126

61196127
#ifdef TARGET_ARM64
6120-
// If the frame pointer is used, then we'll save FP/LR at the bottom of the stack.
6121-
// Otherwise, we won't store FP, and we'll store LR at the top, with the other callee-save
6122-
// registers (if any).
6128+
// If the frame pointer is used, then we'll save FP/LR either at the bottom of the stack
6129+
// or at the top of the stack depending on frame type. We make the decision after assigning
6130+
// the variables on the frame and then fix up the offsets in lvaFixVirtualFrameOffsets.
6131+
// For now, we proceed as if FP/LR were saved with the callee registers. If we later
6132+
// decide to move the FP/LR to the bottom of the frame it shifts all the assigned
6133+
// variables and temporaries by 16 bytes. The largest alignment we currently make is 16
6134+
// bytes for SIMD.
61236135

61246136
int initialStkOffs = 0;
61256137
if (info.compIsVarArgs)
@@ -6130,17 +6142,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
61306142
stkOffs -= initialStkOffs;
61316143
}
61326144

6133-
if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || !isFramePointerUsed()) // Note that currently we always have
6134-
// a frame pointer
6135-
{
6136-
stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;
6137-
}
6138-
else
6139-
{
6140-
// Subtract off FP and LR.
6141-
assert(compCalleeRegsPushed >= 2);
6142-
stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES;
6143-
}
6145+
stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;
61446146

61456147
#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
61466148

@@ -6810,15 +6812,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
68106812
}
68116813
#endif // TARGET_AMD64
68126814

6813-
#ifdef TARGET_ARM64
6814-
if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have
6815-
// a frame pointer
6816-
{
6817-
// Create space for saving FP and LR.
6818-
stkOffs -= 2 * REGSIZE_BYTES;
6819-
}
6820-
#endif // TARGET_ARM64
6821-
68226815
#if FEATURE_FIXED_OUT_ARGS
68236816
if (lvaOutgoingArgSpaceSize > 0)
68246817
{
@@ -6856,6 +6849,44 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
68566849

68576850
noway_assert(compLclFrameSize + originalFrameSize ==
68586851
(unsigned)-(stkOffs + (pushedCount * (int)TARGET_POINTER_SIZE)));
6852+
6853+
#ifdef TARGET_ARM64
6854+
// Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
6855+
// a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
6856+
// need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
6857+
// and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
6858+
// frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
6859+
// (It should be legal to use these frame types for every frame).
6860+
//
6861+
// For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout that can
6862+
// be represented with compact unwinding information. In order to maintain code quality we only do it when
6863+
// we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress optimization, or if the
6864+
// whole frame is small enough that the negative FP-based addressing can address the whole frame.
6865+
6866+
if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
6867+
{
6868+
if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform &&
6869+
(!codeGen->isFramePointerRequired() || codeGen->genTotalFrameSize() < 0x100))
6870+
{
6871+
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true);
6872+
}
6873+
else
6874+
{
6875+
// Default configuration
6876+
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
6877+
opts.compDbgEnC ||
6878+
compStressCompile(Compiler::STRESS_GENERIC_VARN, 20));
6879+
}
6880+
}
6881+
else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
6882+
{
6883+
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
6884+
}
6885+
else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
6886+
{
6887+
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
6888+
}
6889+
#endif // TARGET_ARM64
68596890
}
68606891

68616892
//------------------------------------------------------------------------

src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,5 +120,18 @@ internal static class MachNative
120120
public const uint PLATFORM_TVOSSIMULATOR = 8;
121121
public const uint PLATFORM_WATCHOSSIMULATOR = 9;
122122
public const uint PLATFORM_DRIVERKIT = 10;
123+
124+
public const uint UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
125+
public const uint UNWIND_ARM64_MODE_DWARF = 0x03000000;
126+
public const uint UNWIND_ARM64_MODE_FRAME = 0x04000000;
127+
public const uint UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001;
128+
public const uint UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002;
129+
public const uint UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004;
130+
public const uint UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008;
131+
public const uint UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010;
132+
public const uint UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100;
133+
public const uint UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200;
134+
public const uint UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400;
135+
public const uint UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800;
123136
}
124137
}

0 commit comments

Comments
 (0)