Skip to content

Commit 85c411e

Browse files
authored
[NativeAOT] Enable async runtime suspension and return hijacking on unix-arm64 (#73216)
* Enable async runtime suspension and return hijacking on unix-arm64
* fix unix-x64 build
* new way of epilog detection
* actually wait for GC
* Removed REVIEW comment as we now have a tracking issue
1 parent 3238c2a commit 85c411e

11 files changed

+322
-63
lines changed

src/coreclr/jit/codegenarm64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog)
221221
case 2:
222222
{
223223
// Generate:
224-
// ldr fp,lr,[sp,#outsz]
224+
// ldp fp,lr,[sp,#outsz]
225225
// add sp,sp,#framesz
226226

227227
GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,

src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,7 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF
220220
m_RegDisplay.pFP = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_FramePointer);
221221
m_RegDisplay.pLR = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_RIP);
222222

223-
ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref because we require
224-
// a frame pointer for methods with pinvokes
223+
ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref
225224

226225
if (pFrame->m_Flags & PTFF_SAVE_X19) { m_RegDisplay.pX19 = pPreservedRegsCursor++; }
227226
if (pFrame->m_Flags & PTFF_SAVE_X20) { m_RegDisplay.pX20 = pPreservedRegsCursor++; }
@@ -303,9 +302,6 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF
303302

304303
#endif // defined(USE_PORTABLE_HELPERS)
305304

306-
// @TODO: currently, we always save all registers -- how do we handle the onese we don't save once we
307-
// start only saving those that weren't already saved?
308-
309305
// This function guarantees that the final initialized context will refer to a managed
310306
// frame. In the rare case where the PC does not refer to managed code (and refers to an
311307
// assembly thunk instead), unwind through the thunk sequence to find the nearest managed

src/coreclr/nativeaot/Runtime/amd64/GcProbe.S

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,16 @@ NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
158158
POP_COOP_PINVOKE_FRAME
159159
ret
160160
NESTED_END RhpGcPollRare, _TEXT
161+
162+
163+
#ifdef FEATURE_GC_STRESS
164+
165+
//
166+
// GC Stress Hijack targets
167+
//
168+
LEAF_ENTRY RhpGcStressHijack, _TEXT
169+
// NYI
170+
int 3
171+
LEAF_END RhpGcStressHijack, _TEXT
172+
173+
#endif // FEATURE_GC_STRESS

src/coreclr/nativeaot/Runtime/arm64/GcProbe.S

Lines changed: 174 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,177 @@
44
#include <unixasmmacros.inc>
55
#include "AsmOffsets.inc"
66

7-
.global C_FUNC(RhpGcPoll2)
8-
9-
LEAF_ENTRY RhpGcPoll
10-
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0
11-
cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0
12-
ret
13-
LEAF_END RhpGcPoll
14-
15-
NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
16-
PUSH_COOP_PINVOKE_FRAME x0
17-
bl C_FUNC(RhpGcPoll2)
18-
POP_COOP_PINVOKE_FRAME
19-
ret
20-
NESTED_END RhpGcPollRare
7+
PROBE_FRAME_SIZE = 0xD0 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) +
8+
// 10 * 8 for callee saved registers +
9+
// 1 * 8 for caller SP +
10+
// 2 * 8 for int returns +
11+
// 1 * 8 for alignment padding +
12+
// 4 * 16 for FP/HFA/HVA returns
13+
14+
// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers
15+
// and accepts the register bitmask
16+
// Call this macro first in the method (no further prolog instructions can be added after this).
17+
//
18+
// threadReg : register containing the Thread* (this will be preserved).
19+
// trashReg : register that can be trashed by this macro
20+
// BITMASK : register holding the value to initialize the m_Flags field with (stored via stp, so it must be a register)
21+
.macro PUSH_PROBE_FRAME threadReg, trashReg, BITMASK
22+
23+
// Define the method prolog, allocating enough stack space for the PInvokeTransitionFrame and saving
24+
// incoming register values into it.
25+
26+
// First create PInvokeTransitionFrame
27+
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -PROBE_FRAME_SIZE // Push down stack pointer and store FP and LR
28+
29+
// Slot at [sp, #0x10] is reserved for Thread *
30+
// Slot at [sp, #0x18] is reserved for bitmask of saved registers
31+
32+
// Save callee saved registers
33+
PROLOG_SAVE_REG_PAIR x19, x20, 0x20
34+
PROLOG_SAVE_REG_PAIR x21, x22, 0x30
35+
PROLOG_SAVE_REG_PAIR x23, x24, 0x40
36+
PROLOG_SAVE_REG_PAIR x25, x26, 0x50
37+
PROLOG_SAVE_REG_PAIR x27, x28, 0x60
38+
39+
// Slot at [sp, #0x70] is reserved for caller sp
40+
41+
// Save the integer return registers
42+
stp x0, x1, [sp, #0x78]
43+
44+
// Slot at [sp, #0x88] is alignment padding
45+
46+
// Save the FP/HFA/HVA return registers
47+
stp q0, q1, [sp, #0x90]
48+
stp q2, q3, [sp, #0xB0]
49+
50+
// Perform the rest of the PInvokeTransitionFrame initialization.
51+
// str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker)
52+
// str \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] // save the register bitmask passed in by caller
53+
stp \threadReg, \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread]
54+
55+
add \trashReg, sp, #PROBE_FRAME_SIZE // recover value of caller's SP
56+
str \trashReg, [sp, #0x70] // save caller's SP
57+
58+
// link the frame into the Thread
59+
mov \trashReg, sp
60+
str \trashReg, [\threadReg, #OFFSETOF__Thread__m_pDeferredTransitionFrame]
61+
.endm
62+
63+
//
64+
// Remove the frame from a previous call to PUSH_PROBE_FRAME from the top of the stack and restore preserved
65+
// registers and return value to their values from before the probe was called (while also updating any
66+
// object refs or byrefs).
67+
//
68+
.macro POP_PROBE_FRAME
69+
70+
// Restore the integer return registers
71+
ldp x0, x1, [sp, #0x78]
72+
73+
// Restore the FP/HFA/HVA return registers
74+
ldp q0, q1, [sp, #0x90]
75+
ldp q2, q3, [sp, #0xB0]
76+
77+
// Restore callee saved registers
78+
EPILOG_RESTORE_REG_PAIR x19, x20, 0x20
79+
EPILOG_RESTORE_REG_PAIR x21, x22, 0x30
80+
EPILOG_RESTORE_REG_PAIR x23, x24, 0x40
81+
EPILOG_RESTORE_REG_PAIR x25, x26, 0x50
82+
EPILOG_RESTORE_REG_PAIR x27, x28, 0x60
83+
84+
EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, PROBE_FRAME_SIZE
85+
.endm
86+
87+
//
88+
// The prolog for all GC suspension hijacks (normal and stress). Fixes up the hijacked return address, and
89+
// clears the hijack state.
90+
//
91+
// Register state on entry:
92+
// All registers correct for return to the original return address.
93+
//
94+
// Register state on exit:
95+
// x2: thread pointer
96+
// x12: transition frame flags for the return registers x0 and x1
97+
//
98+
.macro FixupHijackedCallstack
99+
100+
// x2 <- GetThread()
101+
INLINE_GETTHREAD x2
102+
103+
//
104+
// Fix the stack by restoring the original return address
105+
//
106+
// Load m_pvHijackedReturnAddress and m_uHijackedReturnValueFlags
107+
ldp lr, x12, [x2, #OFFSETOF__Thread__m_pvHijackedReturnAddress]
108+
109+
//
110+
// Clear hijack state
111+
//
112+
// Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress
113+
stp xzr, xzr, [x2, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation]
114+
// Clear m_uHijackedReturnValueFlags
115+
str xzr, [x2, #OFFSETOF__Thread__m_uHijackedReturnValueFlags]
116+
117+
.endm
118+
119+
//
120+
// GC Probe Hijack target
121+
//
122+
NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler
123+
FixupHijackedCallstack
124+
125+
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3
126+
tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC
127+
ret
128+
129+
WaitForGC:
130+
orr x12, x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1
131+
b C_FUNC(RhpWaitForGC)
132+
NESTED_END RhpGcProbeHijack
133+
134+
.global C_FUNC(RhpThrowHwEx)
135+
136+
NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler
137+
PUSH_PROBE_FRAME x2, x3, x12
138+
139+
ldr x0, [x2, #OFFSETOF__Thread__m_pDeferredTransitionFrame]
140+
bl C_FUNC(RhpWaitForGC2)
141+
142+
ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags]
143+
tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort
144+
145+
POP_PROBE_FRAME
146+
EPILOG_RETURN
147+
ThrowThreadAbort:
148+
POP_PROBE_FRAME
149+
mov w0, #STATUS_REDHAWK_THREAD_ABORT
150+
mov x1, lr // return address as exception PC
151+
b C_FUNC(RhpThrowHwEx)
152+
NESTED_END RhpWaitForGC
153+
154+
.global C_FUNC(RhpGcPoll2)
155+
156+
LEAF_ENTRY RhpGcPoll
157+
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0
158+
cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0
159+
ret
160+
LEAF_END RhpGcPoll
161+
162+
NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
163+
PUSH_COOP_PINVOKE_FRAME x0
164+
bl C_FUNC(RhpGcPoll2)
165+
POP_COOP_PINVOKE_FRAME
166+
ret
167+
NESTED_END RhpGcPollRare
168+
169+
170+
#ifdef FEATURE_GC_STRESS
171+
172+
//
173+
// GC Stress Hijack targets
174+
//
175+
LEAF_ENTRY RhpGcStressHijack, _TEXT
176+
// NYI
177+
EMIT_BREAKPOINT
178+
LEAF_END RhpGcStressHijack, _TEXT
179+
180+
#endif // FEATURE_GC_STRESS

src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
m_CallersSP field 8 ; SP at routine entry
1717
field 2 * 8 ; x0..x1
1818
field 8 ; alignment padding
19-
field 4 * 8 ; d0..d3
19+
field 4 * 16; q0..q3
2020
PROBE_FRAME_SIZE field 0
2121

2222
;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers
@@ -48,18 +48,20 @@ PROBE_FRAME_SIZE field 0
4848
;; Slot at [sp, #0x70] is reserved for caller sp
4949

5050
;; Save the integer return registers
51-
PROLOG_NOP str x0, [sp, #0x78]
52-
PROLOG_NOP str x1, [sp, #0x80]
51+
PROLOG_NOP stp x0, x1, [sp, #0x78]
5352

5453
;; Slot at [sp, #0x88] is alignment padding
5554

56-
;; Save the floating return registers
57-
PROLOG_NOP stp d0, d1, [sp, #0x90]
58-
PROLOG_NOP stp d2, d3, [sp, #0xA0]
55+
;; Save the FP/HFA/HVA return registers
56+
PROLOG_NOP stp q0, q1, [sp, #0x90]
57+
PROLOG_NOP stp q2, q3, [sp, #0xB0]
5958

6059
;; Perform the rest of the PInvokeTransitionFrame initialization.
61-
str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller
62-
str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker)
60+
;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker)
61+
;; str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller
62+
ASSERT OFFSETOF__PInvokeTransitionFrame__m_Flags == (OFFSETOF__PInvokeTransitionFrame__m_pThread + 8)
63+
stp $threadReg, $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread]
64+
6365
add $trashReg, sp, #PROBE_FRAME_SIZE ; recover value of caller's SP
6466
str $trashReg, [sp, #m_CallersSP] ; save caller's SP
6567

@@ -77,12 +79,11 @@ PROBE_FRAME_SIZE field 0
7779
POP_PROBE_FRAME
7880

7981
;; Restore the integer return registers
80-
PROLOG_NOP ldr x0, [sp, #0x78]
81-
PROLOG_NOP ldr x1, [sp, #0x80]
82+
PROLOG_NOP ldp x0, x1, [sp, #0x78]
8283

83-
; Restore the floating return registers
84-
EPILOG_NOP ldp d0, d1, [sp, #0x90]
85-
EPILOG_NOP ldp d2, d3, [sp, #0xA0]
84+
; Restore the FP/HFA/HVA return registers
85+
EPILOG_NOP ldp q0, q1, [sp, #0x90]
86+
EPILOG_NOP ldp q2, q3, [sp, #0xB0]
8687

8788
;; Restore callee saved registers
8889
EPILOG_RESTORE_REG_PAIR x19, x20, #0x20
@@ -173,11 +174,11 @@ WaitForGC
173174
bl RhpWaitForGC2
174175

175176
ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags]
176-
tbnz x2, #PTFF_THREAD_ABORT_BIT, %F1
177+
tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort
177178

178179
POP_PROBE_FRAME
179180
EPILOG_RETURN
180-
1
181+
ThrowThreadAbort
181182
POP_PROBE_FRAME
182183
EPILOG_NOP mov w0, #STATUS_REDHAWK_THREAD_ABORT
183184
EPILOG_NOP mov x1, lr ;; return address as exception PC

src/coreclr/nativeaot/Runtime/portable.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -401,25 +401,20 @@ EXTERN_C void * ReturnFromCallDescrThunk;
401401
void * ReturnFromCallDescrThunk;
402402
#endif
403403

404-
#if defined(USE_PORTABLE_HELPERS) || defined(TARGET_UNIX)
404+
#if defined(USE_PORTABLE_HELPERS)
405405
//
406406
// Return address hijacking
407407
//
408-
#if !defined (HOST_ARM64)
409408
COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ())
410409
{
411410
ASSERT_UNCONDITIONALLY("NYI");
412411
}
413-
#else // !defined (HOST_ARM64)
412+
414413
COOP_PINVOKE_HELPER(void, RhpGcProbeHijack, ())
415414
{
416415
ASSERT_UNCONDITIONALLY("NYI");
417416
}
418-
COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ())
419-
{
420-
ASSERT_UNCONDITIONALLY("NYI");
421-
}
422-
#endif // !defined (HOST_ARM64)
417+
423418
#endif // defined(USE_PORTABLE_HELPERS)
424419

425420
#if defined(USE_PORTABLE_HELPERS)

src/coreclr/nativeaot/Runtime/thread.cpp

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -587,12 +587,6 @@ void Thread::Hijack()
587587
return;
588588
}
589589

590-
#if defined(TARGET_ARM64) && defined(TARGET_UNIX)
591-
// TODO: RhpGcProbeHijack and related asm helpers NYI for ARM64/UNIX.
592-
// disabling hijacking for now.
593-
return;
594-
#endif
595-
596590
// PalHijack will call HijackCallback or make the target thread call it.
597591
// It may also do nothing if the target thread is in inconvenient state.
598592
PalHijack(m_hPalThread, this);
@@ -640,13 +634,15 @@ void Thread::HijackCallback(NATIVE_CONTEXT* pThreadContext, void* pThreadToHijac
640634
return;
641635
}
642636

643-
ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress);
644-
645637
// we may be able to do GC stack walk right where the thread is now,
646-
// as long as it is on a GC safe point and if we can unwind the stack at that location.
647-
if (codeManager->IsSafePoint(pvAddress) &&
648-
codeManager->IsUnwindable(pvAddress))
638+
// as long as the location is a GC safe point.
639+
ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress);
640+
if (codeManager->IsSafePoint(pvAddress))
649641
{
642+
// we may not be able to unwind in some locations, such as epilogs.
643+
// such locations should not contain safe points.
644+
ASSERT(codeManager->IsUnwindable(pvAddress));
645+
650646
// if we are not given a thread to hijack
651647
// perform in-line wait on the current thread
652648
if (pThreadToHijack == NULL)

src/coreclr/nativeaot/Runtime/threadstore.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent)
218218

219219
bool keepWaiting;
220220
YieldProcessorNormalizationInfo normalizationInfo;
221+
int waitCycles = 1;
221222
do
222223
{
223224
keepWaiting = false;
@@ -248,7 +249,13 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent)
248249
// @TODO: need tuning for spin
249250
// @TODO: need tuning for this whole loop as well.
250251
// we are likely too aggressive with interruptions which may result in longer pauses.
251-
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000);
252+
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, waitCycles);
253+
254+
// simplistic linear backoff for now
255+
// we could be catching threads in restartable sequences such as LL/SC style interlocked on ARM64
256+
// and forcing them to restart.
257+
// if interrupt mechanism is fast, eagerness could be hurting our overall progress.
258+
waitCycles += 10000;
252259
}
253260
}
254261

0 commit comments

Comments
 (0)