diff --git a/src/coreclr/inc/jithelpers.h b/src/coreclr/inc/jithelpers.h index 3f72fdbc8ccc4d..18724a9680a1f2 100644 --- a/src/coreclr/inc/jithelpers.h +++ b/src/coreclr/inc/jithelpers.h @@ -350,11 +350,7 @@ JITHELPER(CORINFO_HELP_GVMLOOKUP_FOR_SLOT, NULL, CORINFO_HELP_SIG_NO_ALIGN_STUB) -#ifndef TARGET_ARM64 JITHELPER(CORINFO_HELP_STACK_PROBE, JIT_StackProbe, CORINFO_HELP_SIG_REG_ONLY) -#else - JITHELPER(CORINFO_HELP_STACK_PROBE, NULL, CORINFO_HELP_SIG_UNDEF) -#endif JITHELPER(CORINFO_HELP_PATCHPOINT, JIT_Patchpoint, CORINFO_HELP_SIG_REG_ONLY) JITHELPER(CORINFO_HELP_CLASSPROFILE, JIT_ClassProfile, CORINFO_HELP_SIG_REG_ONLY) diff --git a/src/coreclr/inc/readytorunhelpers.h b/src/coreclr/inc/readytorunhelpers.h index ea3ea684b3626c..eb8f299ad9eb53 100644 --- a/src/coreclr/inc/readytorunhelpers.h +++ b/src/coreclr/inc/readytorunhelpers.h @@ -119,9 +119,7 @@ HELPER(READYTORUN_HELPER_ReversePInvokeExit, CORINFO_HELP_JIT_REVERSE_PIN HELPER(READYTORUN_HELPER_MonitorEnter, CORINFO_HELP_MON_ENTER, ) HELPER(READYTORUN_HELPER_MonitorExit, CORINFO_HELP_MON_EXIT, ) -#ifndef TARGET_ARM64 HELPER(READYTORUN_HELPER_StackProbe, CORINFO_HELP_STACK_PROBE, ) -#endif HELPER(READYTORUN_HELPER_GetCurrentManagedThreadId, CORINFO_HELP_GETCURRENTMANAGEDTHREADID, ) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 9deffd0883a5e5..b36ed4d7b2ac72 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -346,7 +346,11 @@ class CodeGen final : public CodeGenInterface void genPushCalleeSavedRegisters(); #endif - void genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn); + void genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed); + +#ifdef TARGET_ARMARCH + void genEmitStackProbeHelperCall(int currentSpToFinalSp, regNumber initReg, bool* pInitRegZeroed); +#endif #if defined(TARGET_ARM) diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 
6cc322a794a69d..ac0c1c3bcba32c 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -1827,12 +1827,11 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) // initReg - register to use as a scratch register. // pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if // this call sets 'initReg' to a non-zero value. -// maskArgRegsLiveIn - incoming argument registers that are currently live. // // Return value: // None // -void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed) { assert(compiler->compGeneratingProlog); @@ -1849,41 +1848,9 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni { GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); } - else if (frameSize < compiler->getVeryLargeFrameSize()) - { - for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) - { - // Generate: - // movw initReg, -probeOffset - // ldr initReg, [SP + initReg] - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_PTRSIZE, initReg, REG_SPBASE, initReg); - } - - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, frameSize); - compiler->unwindPadding(); - GetEmitter()->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, initReg); - } else { - assert(frameSize >= compiler->getVeryLargeFrameSize()); - - genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, frameSize, - INS_FLAGS_DONT_CARE, REG_STACK_PROBE_HELPER_ARG); - regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); - genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN, REG_STACK_PROBE_HELPER_CALL_TARGET); - compiler->unwindPadding(); - 
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); - - if ((genRegMask(initReg) & (RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | - RBM_STACK_PROBE_HELPER_TRASH)) != RBM_NONE) - { - *pInitRegZeroed = false; - } + genEmitStackProbeHelperCall(frameSize, initReg, pInitRegZeroed); } compiler->unwindAllocStack(frameSize); @@ -1895,4 +1862,37 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni #endif // USING_SCOPE_INFO } +void CodeGen::genEmitStackProbeHelperCall(int currentSpToFinalSp, regNumber initReg, bool* pInitRegZeroed) +{ + // Generate the following code: + // + // movw r4, #currentSpToFinalSp + // sub r4, sp, r4 + // bl CORINFO_HELP_STACK_PROBE + // mov sp, r4 + // + // If currentSpToFinalSp cannot be encoded as a movw immediate, this becomes: + // + // movw r4, #currentSpToFinalSpLo16 + // movt r4, #currentSpToFinalSpHi16 + // sub r4, sp, r4 + // bl CORINFO_HELP_STACK_PROBE + // mov sp, r4 + + genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, currentSpToFinalSp, + INS_FLAGS_DONT_CARE, REG_STACK_PROBE_HELPER_ARG); + regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); + + genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN, REG_STACK_PROBE_HELPER_CALL_TARGET); + compiler->unwindPadding(); + + GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); + + if ((genRegMask(initReg) & + (RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | RBM_STACK_PROBE_HELPER_TRASH)) != RBM_NONE) + { + *pInitRegZeroed = false; + } +} + #endif // TARGET_ARM diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index dcd3e08a2284a3..464ed2bf0f9069 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -4904,6 +4904,25 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) #endif // PROFILING_SUPPORTED +void CodeGen::genEmitStackProbeHelperCall(int
currentSpToFinalSp, regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + const bool reportUnwindData = false; + genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, currentSpToFinalSp, + REG_STACK_PROBE_HELPER_ARG, reportUnwindData); + regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); + + genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN, REG_STACK_PROBE_HELPER_CALL_TARGET); + GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); + + if ((genRegMask(initReg) & + (RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | RBM_STACK_PROBE_HELPER_TRASH)) != RBM_NONE) + { + *pInitRegZeroed = false; + } +} + /***************************************************************************** * Unit testing of the ARM64 emitter: generate a bunch of instructions into the prolog * (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late @@ -9588,138 +9607,4 @@ void CodeGen::genArm64EmitterUnitTests() } #endif // defined(DEBUG) -//------------------------------------------------------------------------ -// genAllocLclFrame: Probe the stack. -// -// Notes: -// This only does the probing; allocating the frame is done when callee-saved registers are saved. -// This is done before anything has been pushed. The previous frame might have a large outgoing argument -// space that has been allocated, but the lowest addresses have not been touched. Our frame setup might -// not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however, -// there are always three guard pages, so we will not miss them all. On Linux, there is only one guard -// page by default, so we need to be more careful. We do an extra probe if we might not have probed -// recently enough. That is, if a call and prolog establishment might lead to missing a page. 
We do this -// on Windows as well just to be consistent, even though it should not be necessary. -// -// Arguments: -// frameSize - the size of the stack frame being allocated. -// initReg - register to use as a scratch register. -// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if -// this call sets 'initReg' to a non-zero value. Otherwise, it is unchanged. -// maskArgRegsLiveIn - incoming argument registers that are currently live. -// -// Return value: -// None -// -void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) -{ - assert(compiler->compGeneratingProlog); - - if (frameSize == 0) - { - return; - } - - const target_size_t pageSize = compiler->eeGetPageSize(); - - // What offset from the final SP was the last probe? If we haven't probed almost a complete page, and - // if the next action on the stack might subtract from SP first, before touching the current SP, then - // we do one more probe at the very bottom. This can happen if we call a function on arm64 that does - // a "STP fp, lr, [sp-504]!", that is, pre-decrement SP then store. Note that we probe here for arm64, - // but we don't alter SP. 
- target_size_t lastTouchDelta = 0; - - assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); - - if (frameSize < pageSize) - { - lastTouchDelta = frameSize; - } - else if (frameSize < compiler->getVeryLargeFrameSize()) - { - lastTouchDelta = frameSize; - - for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) - { - // Generate: - // movw initReg, -probeOffset - // ldr wzr, [sp + initReg] - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, initReg); - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - - lastTouchDelta -= pageSize; - } - - assert(lastTouchDelta == frameSize % pageSize); - compiler->unwindPadding(); - } - else - { - assert(frameSize >= compiler->getVeryLargeFrameSize()); - - // Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change - // until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl - // the stack afterward (which means the stack pointer needs to be known). - - regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask() | ~RBM_INT_CALLEE_SAVED); - availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live - availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg - - regNumber rOffset = initReg; - regNumber rLimit; - regMaskTP tempMask; - - // We pick the next lowest register number for rLimit - noway_assert(availMask != RBM_NONE); - tempMask = genFindLowestBit(availMask); - rLimit = genRegNumFromMask(tempMask); - - // Generate: - // - // mov rOffset, -pageSize // On arm, this turns out to be "movw r1, 0xf000; sxth r1, r1". - // // We could save 4 bytes in the prolog by using "movs r1, 0" at the - // // runtime expense of running a useless first loop iteration. 
- // mov rLimit, -frameSize - // loop: - // ldr wzr, [sp + rOffset] - // sub rOffset, pageSize - // cmp rLimit, rOffset - // b.ls loop // If rLimit is lower or same, we need to probe this rOffset. Note - // // especially that if it is the same, we haven't probed this page. - - noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); // make sure framesize safely fits within an int - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rOffset, -(ssize_t)pageSize); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(ssize_t)frameSize); - - // There's a "virtual" label here. But we can't create a label in the prolog, so we use the magic - // `emitIns_J` with a negative `instrCount` to branch back a specific number of instructions. - - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, rOffset); - GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize); - GetEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rLimit, rOffset); // If equal, we need to probe again - GetEmitter()->emitIns_J(INS_bls, NULL, -4); - - *pInitRegZeroed = false; // The initReg does not contain zero - - compiler->unwindPadding(); - - lastTouchDelta = frameSize % pageSize; - } - - if (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) - { - assert(lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES < 2 * pageSize); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)frameSize); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, initReg); - compiler->unwindPadding(); - - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - } -} - #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index b6530172251426..c2db49980fe8b5 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -3657,4 +3657,637 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) } #endif // FEATURE_SIMD 
+/*----------------------------------------------------------------------------- + * + * Push any callee-saved registers we have used + */ + +#if defined(TARGET_ARM64) +void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed) +#else +void CodeGen::genPushCalleeSavedRegisters() +#endif +{ + assert(compiler->compGeneratingProlog); + + regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; + +#if ETW_EBP_FRAMED + if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) + { + noway_assert(!"Used register RBM_FPBASE as a scratch register!"); + } +#endif + +#ifdef TARGET_ARMARCH + // On ARM we push the FP (frame-pointer) here along with all other callee saved registers + if (isFramePointerUsed()) + rsPushRegs |= RBM_FPBASE; + + // + // It may be possible to skip pushing/popping lr for leaf methods. However, such optimization would require + // changes in GC suspension architecture. + // + // We would need to guarantee that a tight loop calling a virtual leaf method can be suspended for GC. Today, we + // generate partially interruptible code for both the method that contains the tight loop with the call and the leaf + // method. GC suspension depends on return address hijacking in this case. Return address hijacking depends + // on the return address to be saved on the stack. If we skipped pushing/popping lr, the return address would never + // be saved on the stack and the GC suspension would time out. + // + // So if we wanted to skip pushing/popping lr for leaf frames, we would also need to do one of + // the following to make GC suspension work in the above scenario: + // - Make return address hijacking work even when lr is not saved on the stack. + // - Generate fully interruptible code for loops that contain calls + // - Generate fully interruptible code for leaf methods + // + // Given the limited benefit from this optimization (<10k for CoreLib NGen image), the extra complexity + // is not worth it.
+ // + rsPushRegs |= RBM_LR; // We must save the return address (in the LR register) + + regSet.rsMaskCalleeSaved = rsPushRegs; +#endif // TARGET_ARMARCH + +#ifdef DEBUG + if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) + { + printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", + compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); + dspRegMask(rsPushRegs); + printf("\n"); + assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); + } +#endif // DEBUG + +#if defined(TARGET_ARM) + regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat; + + maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat); + + assert(FitsIn<int>(maskPushRegsInt)); + inst_IV(INS_push, (int)maskPushRegsInt); + compiler->unwindPushMaskInt(maskPushRegsInt); + + if (maskPushRegsFloat != 0) + { + genPushFltRegs(maskPushRegsFloat); + compiler->unwindPushMaskFloat(maskPushRegsFloat); + } +#elif defined(TARGET_ARM64) + // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details or requirements and + // options. Case numbers in comments here refer to this document. See also Compiler::lvaAssignFrameOffsets() + // for pictures of the general frame layouts, and CodeGen::genFuncletProlog() implementations (per architecture) + // for pictures of the funclet frame layouts. + // + // For most frames, generate, e.g.: + // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. + // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area + // // at top of frame (highest addresses). + // stp r21, r22, [sp, 0x70] + // + // Notes: + // 1. We don't always need to save FP. If FP isn't saved, then LR is saved with the other callee-saved registers + // at the top of the frame. + // 2. If we save FP, then the first store is FP, LR. + // 3.
General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only + // preserve their lower 8 bytes, by calling convention. + // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are + // consecutive, and at the top of the frame. + // 5. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). + // + // For functions with GS and localloc, we change the frame so the frame pointer and LR are saved at the top + // of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same + // rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP. + // Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. + // + // The frames look like the following (simplified to only include components that matter for establishing the + // frames). See also Compiler::lvaAssignFrameOffsets(). + // + // Frames with FP, LR saved at bottom of frame (above outgoing argument space): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. 
| + // |-----------------------| + // | possible GS cookie | + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // + // Frames with FP, LR saved at top of frame (below saved varargs incoming arguments): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. | + // |-----------------------| + // | possible GS cookie | + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // + + int totalFrameSize = genTotalFrameSize(); + + const int probePageSize = (int)compiler->eeGetPageSize(); + + int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. 
+ + regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; + +#ifdef DEBUG + if (verbose) + { + printf("Save float regs: "); + dspRegMask(maskSaveRegsFloat); + printf("\n"); + printf("Save int regs: "); + dspRegMask(maskSaveRegsInt); + printf("\n"); + } +#endif // DEBUG + + // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we + // generate based on various sizes. + int frameType = 0; + + // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the + // first save instruction as a "predecrement" amount, if possible. + int calleeSaveSPDelta = 0; + + if (isFramePointerUsed()) + { + // We need to save both FP and LR. + + assert((maskSaveRegsInt & RBM_FP) != 0); + assert((maskSaveRegsInt & RBM_LR) != 0); + + // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address + // (FP and LR) are protected from buffer overrun by the GS cookie. If FP/LR are at the lowest addresses, + // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will + // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our + // saved FP/LR. In that case, we save FP/LR along with the rest of the callee-saved registers, above + // the GS cookie. + // + // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to + // create a frame pointer chain. + // + // Do we need another frame pointer register to get good code quality in the case of having the frame pointer + // point high in the frame, so we can take advantage of arm64's preference for positive offsets? C++ native + // code dedicates callee-saved x19 to this, so generates: + // mov x19, sp + // in the prolog, then uses x19 for local var accesses. Given that this case is so rare, we currently do + // not do this. 
That means that negative offsets from FP might need to use the reserved register to form + // the local variable offset for an addressing mode. + + if (((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize <= 504)) && + !genSaveFpLrWithAllCalleeSavedRegisters) + { + // Case #1. + // + // Generate: + // stp fp,lr,[sp,#-framesz]! + // + // The (totalFrameSize <= 504) condition ensures that both the pre-index STP instruction + // used in the prolog, and the post-index LDP instruction used in the epilog, can be generated. + // Note that STP and the unwind codes can handle -512, but LDP with a positive post-index value + // can only handle up to 504, and we want our prolog and epilog to match. + // + // After saving callee-saved registers, we establish the frame pointer with: + // mov fp,sp + // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + + JITDUMP("Frame type 1. #outsz=0; #framesz=%d; LclFrameSize=%d\n", totalFrameSize, + compiler->compLclFrameSize); + + frameType = 1; + + assert(totalFrameSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); + + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize, + INS_OPTS_PRE_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize); + + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR + offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + } + else if (totalFrameSize <= 512) + { + // Case #2. + // + // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP + // with signed offset encoding. The maximum positive STP offset is 504, but when storing a pair of + // 8 byte registers, the largest actual offset we use would be 512 - 8 * 2 = 496. And STR with positive + // offset has a range 0 to 32760. 
+ // + // After saving callee-saved registers, we establish the frame pointer with: + // add fp,sp,#outsz + // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 4 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 4; + + // The frame will be allocated below, when the callee-saved registers are saved. This might mean a + // separate SUB instruction or the SP adjustment might be folded in to the first STP if there is + // no outgoing argument space AND no local frame space, that is, if the only thing the frame does + // is save callee-saved registers (and possibly varargs argument registers). + calleeSaveSPDelta = totalFrameSize; + + offset = (int)compiler->compLclFrameSize; + } + else + { + JITDUMP("Frame type 2 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 2; + + // Generate: + // sub sp,sp,#framesz + // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. 
+ + assert(totalFrameSize - compiler->lvaOutgoingArgSpaceSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); + + GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); + compiler->unwindAllocStack(totalFrameSize); + + assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); + + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, + compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); + + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR + offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + } + } + else + { + // Case 5 or 6. + // + // First, the callee-saved registers will be saved, and the callee-saved register code must use + // pre-index to subtract from SP as the first instruction. It must also leave space for varargs + // registers to be stored. For example: + // stp r19,r20,[sp,#-96]! + // stp d8,d9,[sp,#16] + // ... save varargs incoming integer registers ... + // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be + // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate + // alignment). So, if there is an odd number of callee-saved registers, we use (for example, with just + // one saved register): + // sub sp,sp,#16 + // str r19,[sp,#8] + // This is one additional instruction, but it centralizes the aligned space. Otherwise, it might be + // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one + // above them. If that is preferable, we could implement it. + // + // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument + // registers. + // + // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment + // padding from above). 
Note that #remainingFrameSz must not be zero, since we still need to save FP,LR. + // + // Generate: + // sub sp,sp,#remainingFrameSz + // or, for large frames: + // mov rX, #remainingFrameSz // maybe multiple instructions + // sub sp,sp,rX + // + // followed by: + // stp fp,lr,[sp,#outsz] + // add fp,sp,#outsz + // + // However, we need to handle the case where #outsz is larger than the constant signed offset encoding + // can handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., + // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of + // the following sequences: + // + // Define #remainingFrameSz2 = #remainingFrameSz - #outsz. + // + // sub sp,sp,#remainingFrameSz2 // if #remainingFrameSz2 is 16-byte aligned + // stp fp,lr,[sp] + // mov fp,sp + // sub sp,sp,#outsz // in this case, #outsz must also be 16-byte aligned + // + // Or: + // + // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is + // // always guaranteed to be 8 byte aligned). + // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case + // add fp,sp,#8 + // sub sp,sp,#outsz - #8 + // + // (As usual, for a large constant "#outsz - #8", we might need multiple instructions: + // mov rX, #outsz - #8 // maybe multiple instructions + // sub sp,sp,rX + // ) + // + // Note that even if we align the SP alterations, that does not imply that we are creating empty alignment + // slots. In fact, we are not; any empty alignment slots were calculated in + // Compiler::lvaAssignFrameOffsets() and its callees. + + int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 5 (save FP/LR at top).
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + // This case is much simpler, because we allocate space for the callee-saved register area, including + // FP/LR. Note the SP adjustment might be SUB or be folded into the first store as a predecrement. + // Then, we use a single SUB to establish the rest of the frame. We need to be careful about where + // to establish the frame pointer, as there is a limit of 2040 bytes offset from SP to FP in the + // unwind codes when FP is established. + frameType = 5; + } + else + { + JITDUMP("Frame type 3 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 3; + + calleeSaveSPDeltaUnaligned -= 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. + + // We'll take care of these later, but callee-saved regs code shouldn't see them. + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); + } + + assert(calleeSaveSPDeltaUnaligned >= 0); + assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. + calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + + offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; + + JITDUMP(" calleeSaveSPDelta=%d, offset=%d\n", calleeSaveSPDelta, offset); + + // At most one alignment slot between SP and where we store the callee-saved registers. + assert((offset == 0) || (offset == REGSIZE_BYTES)); + } + } + else + { + // No frame pointer (no chaining). + assert((maskSaveRegsInt & RBM_FP) == 0); + assert((maskSaveRegsInt & RBM_LR) != 0); + + // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using + // 'stp' if we only have one callee-saved register plus LR to save. 
+ + NYI("Frame without frame pointer"); + offset = 0; + } + + assert(frameType != 0); + + JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); + genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); + + offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; + + // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, + // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't + // need to add codes at all. + + if (compiler->info.compIsVarArgs) + { + JITDUMP(" compIsVarArgs=true\n"); + + // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. + assert((offset % 16) == 0); + for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) + { + regNumber reg2 = REG_NEXT(reg1); + // stp REG, REG + 1, [SP, #offset] + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, offset); + compiler->unwindNop(); + offset += 2 * REGSIZE_BYTES; + } + } + + // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) + bool establishFramePointer = true; + + // If we do establish the frame pointer, what is the amount we add to SP to do so? + unsigned offsetSpToSavedFp = 0; + + auto emitUnrolledStackProbeLoop = [this, probePageSize, initReg, pInitRegZeroed](int currentSpToFpLrLoc, + int lastProbedLocToCurrentSp) { + // We can not call a stack probe helper before storing lr register on the stack + // since the call would trash that register. + // Instead for relatively small frames (smaller than STACK_PROBE_HELPER_FRAME_SIZE_PAGES) + // the JIT emits unrolled stack probing loop. + // "stp fp, lr, [sp, #fpLrLoc]" would also count as a probe, hence we use and maintain + // the value of currentSpToFpLrLoc in the algorithm below. 
+ + assert(currentSpToFpLrLoc < STACK_PROBE_HELPER_FRAME_SIZE_PAGES * probePageSize); + + // Generate the following code + // + // sub sp, sp, #probePageSize + // ldr xzr, [sp,#probeLocToCurrentSp] + // ... + // sub sp, sp, #probePageSize + // ldr xzr, [sp,#probeLocToCurrentSp] + // + // until sp is closer than probePageSize to a location + // where fp,lr register pair will be written. + + int lastProbedLocToFpLrLoc = lastProbedLocToCurrentSp + currentSpToFpLrLoc; + + while (currentSpToFpLrLoc > probePageSize) + { + const int probeLocToFpLrLoc = lastProbedLocToFpLrLoc - probePageSize; + + genStackPointerAdjustment(-probePageSize, initReg, pInitRegZeroed, /* reportUnwindData */ true); + currentSpToFpLrLoc -= probePageSize; + + const int probeLocToCurrentSp = probeLocToFpLrLoc - currentSpToFpLrLoc; + + GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, probeLocToCurrentSp); + compiler->unwindNop(); + + lastProbedLocToFpLrLoc = probeLocToFpLrLoc; + } + + // The loop doesn't have to stop at a location where fp,lr register pair will be written. + // Therefore, we need to return the distance between updated sp value and that location to + // a user of this lambda. + return currentSpToFpLrLoc; + }; + + if (frameType == 1) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + assert(offsetSpToSavedFp == 0); + } + else if (frameType == 2) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; + } + else if (frameType == 3) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. 
+ + int lastProbedLocToCurrentSp = STACK_PROBE_BOUNDARY_THRESHOLD_BYTES; + + if (compiler->info.compIsVarArgs || ((maskSaveRegsInt | maskSaveRegsFloat) != 0)) + { + lastProbedLocToCurrentSp = 0; + } + + if (compiler->lvaOutgoingArgSpaceSize > 504) + { + // We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big. + // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. + assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); + int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; + int spAdjustment2 = (int)roundUp((unsigned)spAdjustment2Unaligned, STACK_ALIGN); + int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; + assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); + + JITDUMP(" spAdjustment2=%d\n", spAdjustment2); + + int currentSpToFpLrLoc = spAdjustment2; + currentSpToFpLrLoc = emitUnrolledStackProbeLoop(currentSpToFpLrLoc, lastProbedLocToCurrentSp); + + genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -currentSpToFpLrLoc, false, initReg, + pInitRegZeroed); + offset += spAdjustment2; + + // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" + // included some of it) + + int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; + assert(spAdjustment3 > 0); + assert((spAdjustment3 % 16) == 0); + + JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); + genEstablishFramePointer(alignmentAdjustment2, /* reportUnwindData */ true); + + // We just established the frame pointer chain; don't do it again. + establishFramePointer = false; + + JITDUMP(" spAdjustment3=%d\n", spAdjustment3); + + if (spAdjustment3 >= probePageSize) + { + genEmitStackProbeHelperCall(spAdjustment3, initReg, pInitRegZeroed); + } + else + { + // We've already established the frame pointer, so no need to report the stack pointer change to unwind + // info. 
+ genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false); + } + offset += spAdjustment3; + } + else + { + int currentSpToFpLrLoc = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; + currentSpToFpLrLoc = emitUnrolledStackProbeLoop(currentSpToFpLrLoc, lastProbedLocToCurrentSp); + + const int currentSpToFinalSp = currentSpToFpLrLoc + compiler->lvaOutgoingArgSpaceSize; + genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -currentSpToFinalSp, false, initReg, + pInitRegZeroed); + + offset += remainingFrameSz; + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; + } + } + else if (frameType == 4) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + } + else if (frameType == 5) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + + // We just established the frame pointer chain; don't do it again. + establishFramePointer = false; + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. + + JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); + + if (remainingFrameSz >= probePageSize) + { + genEmitStackProbeHelperCall(remainingFrameSz, initReg, pInitRegZeroed); + } + else + { + // We've already established the frame pointer, so no need to report the unwind info at this point. 
+ genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, /* reportUnwindData */ false); + } + + offset += remainingFrameSz; + } + else + { + unreached(); + } + + if (establishFramePointer) + { + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + } + + assert(offset == totalFrameSize); +#endif // TARGET_ARM64 +} + #endif // TARGET_ARMARCH diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 3933ace5f345f4..2c46b09788a0f6 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4832,601 +4832,6 @@ void CodeGen::genCheckUseBlockInit() } } -/*----------------------------------------------------------------------------- - * - * Push any callee-saved registers we have used - */ - -#if defined(TARGET_ARM64) -void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed) -#else -void CodeGen::genPushCalleeSavedRegisters() -#endif -{ - assert(compiler->compGeneratingProlog); - -#if defined(TARGET_XARCH) - // x86/x64 doesn't support push of xmm/ymm regs, therefore consider only integer registers for pushing onto stack - // here. Space for float registers to be preserved is stack allocated and saved as part of prolog sequence and not - // here. 
- regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_INT_CALLEE_SAVED; -#else // !defined(TARGET_XARCH) - regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; -#endif - -#if ETW_EBP_FRAMED - if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) - { - noway_assert(!"Used register RBM_FPBASE as a scratch register!"); - } -#endif - -#ifdef TARGET_XARCH - // On X86/X64 we have already pushed the FP (frame-pointer) prior to calling this method - if (isFramePointerUsed()) - { - rsPushRegs &= ~RBM_FPBASE; - } -#endif - -#ifdef TARGET_ARMARCH - // On ARM we push the FP (frame-pointer) here along with all other callee saved registers - if (isFramePointerUsed()) - rsPushRegs |= RBM_FPBASE; - - // - // It may be possible to skip pushing/popping lr for leaf methods. However, such optimization would require - // changes in GC suspension architecture. - // - // We would need to guarantee that a tight loop calling a virtual leaf method can be suspended for GC. Today, we - // generate partially interruptible code for both the method that contains the tight loop with the call and the leaf - // method. GC suspension depends on return address hijacking in this case. Return address hijacking depends - // on the return address to be saved on the stack. If we skipped pushing/popping lr, the return address would never - // be saved on the stack and the GC suspension would time out. - // - // So if we wanted to skip pushing pushing/popping lr for leaf frames, we would also need to do one of - // the following to make GC suspension work in the above scenario: - // - Make return address hijacking work even when lr is not saved on the stack. - // - Generate fully interruptible code for loops that contains calls - // - Generate fully interruptible code for leaf methods - // - // Given the limited benefit from this optimization (<10k for CoreLib NGen image), the extra complexity - // is not worth it. 
- // - rsPushRegs |= RBM_LR; // We must save the return address (in the LR register) - - regSet.rsMaskCalleeSaved = rsPushRegs; -#endif // TARGET_ARMARCH - -#ifdef DEBUG - if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) - { - printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", - compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); - dspRegMask(rsPushRegs); - printf("\n"); - assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); - } -#endif // DEBUG - -#if defined(TARGET_ARM) - regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT; - regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat; - - maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat); - - assert(FitsIn(maskPushRegsInt)); - inst_IV(INS_push, (int)maskPushRegsInt); - compiler->unwindPushMaskInt(maskPushRegsInt); - - if (maskPushRegsFloat != 0) - { - genPushFltRegs(maskPushRegsFloat); - compiler->unwindPushMaskFloat(maskPushRegsFloat); - } -#elif defined(TARGET_ARM64) - // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details or requirements and - // options. Case numbers in comments here refer to this document. See also Compiler::lvaAssignFrameOffsets() - // for pictures of the general frame layouts, and CodeGen::genFuncletProlog() implementations (per architecture) - // for pictures of the funclet frame layouts. - // - // For most frames, generate, e.g.: - // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. - // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area - // // at top of frame (highest addresses). - // stp r21, r22, [sp, 0x70] - // - // Notes: - // 1. We don't always need to save FP. If FP isn't saved, then LR is saved with the other callee-saved registers - // at the top of the frame. - // 2. If we save FP, then the first store is FP, LR. - // 3. 
General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only - // preserve their lower 8 bytes, by calling convention. - // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are - // consecutive, and at the top of the frame. - // 5. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). - // - // For functions with GS and localloc, we change the frame so the frame pointer and LR are saved at the top - // of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same - // rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP. - // Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. - // - // The frames look like the following (simplified to only include components that matter for establishing the - // frames). See also Compiler::lvaAssignFrameOffsets(). - // - // Frames with FP, LR saved at bottom of frame (above outgoing argument space): - // - // | | - // |-----------------------| - // | incoming arguments | - // +=======================+ <---- Caller's SP - // | Varargs regs space | // Only for varargs functions; 64 bytes - // |-----------------------| - // |Callee saved registers | // not including FP/LR; multiple of 8 bytes - // |-----------------------| - // | PSP slot | // 8 bytes (omitted in CoreRT ABI) - // |-----------------------| - // | locals, temps, etc. 
| - // |-----------------------| - // | possible GS cookie | - // |-----------------------| - // | Saved LR | // 8 bytes - // |-----------------------| - // | Saved FP | // 8 bytes - // |-----------------------| - // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) - // |-----------------------| <---- Ambient SP - // | | | - // ~ | Stack grows ~ - // | | downward | - // V - // - // Frames with FP, LR saved at top of frame (below saved varargs incoming arguments): - // - // | | - // |-----------------------| - // | incoming arguments | - // +=======================+ <---- Caller's SP - // | Varargs regs space | // Only for varargs functions; 64 bytes - // |-----------------------| - // | Saved LR | // 8 bytes - // |-----------------------| - // | Saved FP | // 8 bytes - // |-----------------------| - // |Callee saved registers | // not including FP/LR; multiple of 8 bytes - // |-----------------------| - // | PSP slot | // 8 bytes (omitted in CoreRT ABI) - // |-----------------------| - // | locals, temps, etc. | - // |-----------------------| - // | possible GS cookie | - // |-----------------------| - // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) - // |-----------------------| <---- Ambient SP - // | | | - // ~ | Stack grows ~ - // | | downward | - // V - // - - int totalFrameSize = genTotalFrameSize(); - - int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. - - regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; - -#ifdef DEBUG - if (verbose) - { - printf("Save float regs: "); - dspRegMask(maskSaveRegsFloat); - printf("\n"); - printf("Save int regs: "); - dspRegMask(maskSaveRegsInt); - printf("\n"); - } -#endif // DEBUG - - // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we - // generate based on various sizes. 
- int frameType = 0; - - // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the - // first save instruction as a "predecrement" amount, if possible. - int calleeSaveSPDelta = 0; - - if (isFramePointerUsed()) - { - // We need to save both FP and LR. - - assert((maskSaveRegsInt & RBM_FP) != 0); - assert((maskSaveRegsInt & RBM_LR) != 0); - - // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address - // (FP and LR) are protected from buffer overrun by the GS cookie. If FP/LR are at the lowest addresses, - // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will - // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our - // saved FP/LR. In that case, we save FP/LR along with the rest of the callee-saved registers, above - // the GS cookie. - // - // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to - // create a frame pointer chain. - // - // Do we need another frame pointer register to get good code quality in the case of having the frame pointer - // point high in the frame, so we can take advantage of arm64's preference for positive offsets? C++ native - // code dedicates callee-saved x19 to this, so generates: - // mov x19, sp - // in the prolog, then uses x19 for local var accesses. Given that this case is so rare, we currently do - // not do this. That means that negative offsets from FP might need to use the reserved register to form - // the local variable offset for an addressing mode. - - if (((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize <= 504)) && - !genSaveFpLrWithAllCalleeSavedRegisters) - { - // Case #1. - // - // Generate: - // stp fp,lr,[sp,#-framesz]! 
- // - // The (totalFrameSize <= 504) condition ensures that both the pre-index STP instruction - // used in the prolog, and the post-index LDP instruction used in the epilog, can be generated. - // Note that STP and the unwind codes can handle -512, but LDP with a positive post-index value - // can only handle up to 504, and we want our prolog and epilog to match. - // - // After saving callee-saved registers, we establish the frame pointer with: - // mov fp,sp - // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. - - JITDUMP("Frame type 1. #outsz=0; #framesz=%d; LclFrameSize=%d\n", totalFrameSize, - compiler->compLclFrameSize); - - frameType = 1; - - assert(totalFrameSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); - - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize, - INS_OPTS_PRE_INDEX); - compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize); - - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR - offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR - } - else if (totalFrameSize <= 512) - { - // Case #2. - // - // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP - // with signed offset encoding. The maximum positive STP offset is 504, but when storing a pair of - // 8 byte registers, the largest actual offset we use would be 512 - 8 * 2 = 496. And STR with positive - // offset has a range 0 to 32760. - // - // After saving callee-saved registers, we establish the frame pointer with: - // add fp,sp,#outsz - // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. - - if (genSaveFpLrWithAllCalleeSavedRegisters) - { - JITDUMP("Frame type 4 (save FP/LR at top). 
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 4; - - // The frame will be allocated below, when the callee-saved registers are saved. This might mean a - // separate SUB instruction or the SP adjustment might be folded in to the first STP if there is - // no outgoing argument space AND no local frame space, that is, if the only thing the frame does - // is save callee-saved registers (and possibly varargs argument registers). - calleeSaveSPDelta = totalFrameSize; - - offset = (int)compiler->compLclFrameSize; - } - else - { - JITDUMP("Frame type 2 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 2; - - // Generate: - // sub sp,sp,#framesz - // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. - - assert(totalFrameSize - compiler->lvaOutgoingArgSpaceSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); - - GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); - compiler->unwindAllocStack(totalFrameSize); - - assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); - - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, - compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); - - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR - offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR - } - } - else - { - // Case 5 or 6. - // - // First, the callee-saved registers will be saved, and the callee-saved register code must use - // pre-index to subtract from SP as the first instruction. It must also leave space for varargs - // registers to be stored. For example: - // stp r19,r20,[sp,#-96]! 
- // stp d8,d9,[sp,#16] - // ... save varargs incoming integer registers ... - // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be - // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate - // alignment). So, if there is an odd number of callee-saved registers, we use (for example, with just - // one saved register): - // sub sp,sp,#16 - // str r19,[sp,#8] - // This is one additional instruction, but it centralizes the aligned space. Otherwise, it might be - // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one - // above them. If that is preferable, we could implement it. - // - // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument - // registers. - // - // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment - // padding from above). Note that #remainingFrameSz must not be zero, since we still need to save FP,SP. - // - // Generate: - // sub sp,sp,#remainingFrameSz - // or, for large frames: - // mov rX, #remainingFrameSz // maybe multiple instructions - // sub sp,sp,rX - // - // followed by: - // stp fp,lr,[sp,#outsz] - // add fp,sp,#outsz - // - // However, we need to handle the case where #outsz is larger than the constant signed offset encoding - // can handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., - // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of - // the following sequences: - // - // Define #remainingFrameSz2 = #remainingFrameSz - #outsz. 
- // - // sub sp,sp,#remainingFrameSz2 // if #remainingFrameSz2 is 16-byte aligned - // stp fp,lr,[sp] - // mov fp,sp - // sub sp,sp,#outsz // in this case, #outsz must also be 16-byte aligned - // - // Or: - // - // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is - // // always guaranteed to be 8 byte aligned). - // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case - // add fp,sp,#8 - // sub sp,sp,#outsz - #8 - // - // (As usual, for a large constant "#outsz - #8", we might need multiple instructions: - // mov rX, #outsz - #8 // maybe multiple instructions - // sub sp,sp,rX - // ) - // - // Note that even if we align the SP alterations, that does not imply that we are creating empty alignment - // slots. In fact, we are not; any empty alignment slots were calculated in - // Compiler::lvaAssignFrameOffsets() and its callees. - - int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; - if (genSaveFpLrWithAllCalleeSavedRegisters) - { - JITDUMP("Frame type 5 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - // This case is much simpler, because we allocate space for the callee-saved register area, including - // FP/LR. Note the SP adjustment might be SUB or be folded into the first store as a predecrement. - // Then, we use a single SUB to establish the rest of the frame. We need to be careful about where - // to establish the frame pointer, as there is a limit of 2040 bytes offset from SP to FP in the - // unwind codes when FP is established. - frameType = 5; - } - else - { - JITDUMP("Frame type 3 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 3; - - calleeSaveSPDeltaUnaligned -= 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. 
- - // We'll take care of these later, but callee-saved regs code shouldn't see them. - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); - } - - assert(calleeSaveSPDeltaUnaligned >= 0); - assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. - calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); - - offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; - - JITDUMP(" calleeSaveSPDelta=%d, offset=%d\n", calleeSaveSPDelta, offset); - - // At most one alignment slot between SP and where we store the callee-saved registers. - assert((offset == 0) || (offset == REGSIZE_BYTES)); - } - } - else - { - // No frame pointer (no chaining). - assert((maskSaveRegsInt & RBM_FP) == 0); - assert((maskSaveRegsInt & RBM_LR) != 0); - - // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using - // 'stp' if we only have one callee-saved register plus LR to save. - - NYI("Frame without frame pointer"); - offset = 0; - } - - assert(frameType != 0); - - JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); - genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); - - offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; - - // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, - // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't - // need to add codes at all. - - if (compiler->info.compIsVarArgs) - { - JITDUMP(" compIsVarArgs=true\n"); - - // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. 
- assert((offset % 16) == 0); - for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) - { - regNumber reg2 = REG_NEXT(reg1); - // stp REG, REG + 1, [SP, #offset] - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, offset); - compiler->unwindNop(); - offset += 2 * REGSIZE_BYTES; - } - } - - // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) - bool establishFramePointer = true; - - // If we do establish the frame pointer, what is the amount we add to SP to do so? - unsigned offsetSpToSavedFp = 0; - - if (frameType == 1) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - assert(offsetSpToSavedFp == 0); - } - else if (frameType == 2) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - - offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; - } - else if (frameType == 3) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - - int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; - assert(remainingFrameSz > 0); - assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- - // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. - - if (compiler->lvaOutgoingArgSpaceSize > 504) - { - // We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big. - // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. 
- assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); - int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; - int spAdjustment2 = (int)roundUp((unsigned)spAdjustment2Unaligned, STACK_ALIGN); - int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; - assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); - - JITDUMP(" spAdjustment2=%d\n", spAdjustment2); - - genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed); - offset += spAdjustment2; - - // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" - // included some of it) - - int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; - assert(spAdjustment3 > 0); - assert((spAdjustment3 % 16) == 0); - - JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); - genEstablishFramePointer(alignmentAdjustment2, /* reportUnwindData */ true); - - // We just established the frame pointer chain; don't do it again. - establishFramePointer = false; - - JITDUMP(" spAdjustment3=%d\n", spAdjustment3); - - // We've already established the frame pointer, so no need to report the stack pointer change to unwind - // info. - genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false); - offset += spAdjustment3; - } - else - { - genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -remainingFrameSz, false, initReg, - pInitRegZeroed); - offset += remainingFrameSz; - - offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; - } - } - else if (frameType == 4) - { - assert(genSaveFpLrWithAllCalleeSavedRegisters); - offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? 
MAX_REG_ARG * REGSIZE_BYTES : 0) - - 2 * REGSIZE_BYTES; // -2 for FP, LR - } - else if (frameType == 5) - { - assert(genSaveFpLrWithAllCalleeSavedRegisters); - - offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - - 2 * REGSIZE_BYTES; // -2 for FP, LR - JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); - genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); - - // We just established the frame pointer chain; don't do it again. - establishFramePointer = false; - - int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; - assert(remainingFrameSz > 0); - assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- - // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. - - JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); - - // We've already established the frame pointer, so no need to report the stack pointer change to unwind info. - genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, /* reportUnwindData */ false); - offset += remainingFrameSz; - } - else - { - unreached(); - } - - if (establishFramePointer) - { - JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); - genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); - } - - assert(offset == totalFrameSize); - -#elif defined(TARGET_XARCH) - // Push backwards so we match the order we will pop them in the epilog - // and all the other code that expects it to be in this order. 
- for (regNumber reg = REG_INT_LAST; rsPushRegs != RBM_NONE; reg = REG_PREV(reg)) - { - regMaskTP regBit = genRegMask(reg); - - if ((regBit & rsPushRegs) != 0) - { - inst_RV(INS_push, reg, TYP_REF); - compiler->unwindPush(reg); -#ifdef USING_SCOPE_INFO - if (!doubleAlignOrFramePointerUsed()) - { - psiAdjustStackLevel(REGSIZE_BYTES); - } -#endif // USING_SCOPE_INFO - rsPushRegs &= ~regBit; - } - } - -#else - assert(!"Unknown TARGET"); -#endif // TARGET* -} - #if defined(TARGET_ARM) void CodeGen::genPushFltRegs(regMaskTP regMask) @@ -7107,10 +6512,8 @@ void CodeGen::genFinalizeFrame() #endif // TARGET_X86 #ifdef TARGET_ARM - // Make sure that callee-saved registers used by call to a stack probing helper generated for very large stack - // frames - // (see `getVeryLargeFrameSize`) are pushed on stack. - if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) + // Make sure that callee-saved registers used by call to a stack probing helper generated are pushed on stack. + if (compiler->compLclFrameSize >= compiler->eeGetPageSize()) { regSet.rsSetRegsModified(RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | RBM_STACK_PROBE_HELPER_TRASH); @@ -7695,37 +7098,26 @@ void CodeGen::genFnProlog() } #endif // TARGET_ARM -#if defined(TARGET_XARCH) - if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) + tempMask = initRegs & ~excludeMask & ~regSet.rsMaskResvd; + + if (tempMask != RBM_NONE) { - // We currently must use REG_EAX on x86 here - // because the loop's backwards branch depends upon the size of EAX encodings - assert(initReg == REG_EAX); + // We will use one of the registers that we were planning to zero init anyway. + // We pick the lowest register number. + tempMask = genFindLowestBit(tempMask); + initReg = genRegNumFromMask(tempMask); } + // Next we prefer to use one of the unused argument registers. + // If they aren't available we use one of the caller-saved integer registers. 
else -#endif // TARGET_XARCH { - tempMask = initRegs & ~excludeMask & ~regSet.rsMaskResvd; - + tempMask = regSet.rsGetModifiedRegsMask() & RBM_ALLINT & ~excludeMask & ~regSet.rsMaskResvd; if (tempMask != RBM_NONE) { - // We will use one of the registers that we were planning to zero init anyway. - // We pick the lowest register number. + // We pick the lowest register number tempMask = genFindLowestBit(tempMask); initReg = genRegNumFromMask(tempMask); } - // Next we prefer to use one of the unused argument registers. - // If they aren't available we use one of the caller-saved integer registers. - else - { - tempMask = regSet.rsGetModifiedRegsMask() & RBM_ALLINT & ~excludeMask & ~regSet.rsMaskResvd; - if (tempMask != RBM_NONE) - { - // We pick the lowest register number - tempMask = genFindLowestBit(tempMask); - initReg = genRegNumFromMask(tempMask); - } - } } noway_assert(!compiler->compMethodRequiresPInvokeFrame() || (initReg != REG_PINVOKE_FRAME)); @@ -7787,16 +7179,6 @@ void CodeGen::genFnProlog() #endif // TARGET_XARCH #ifdef TARGET_ARM64 - // Probe large frames now, if necessary, since genPushCalleeSavedRegisters() will allocate the frame. Note that - // for arm64, genAllocLclFrame only probes the frame; it does not actually allocate it (it does not change SP). - // For arm64, we are probing the frame before the callee-saved registers are saved. The 'initReg' might have - // been calculated to be one of the callee-saved registers (say, if all the integer argument registers are - // in use, and perhaps with other conditions being satisfied). This is ok in other cases, after the callee-saved - // registers have been saved. So instead of letting genAllocLclFrame use initReg as a temporary register, - // always use REG_SCRATCH. We don't care if it trashes it, so ignore the initRegZeroed output argument. 
- bool ignoreInitRegZeroed = false; - genAllocLclFrame(compiler->compLclFrameSize, REG_SCRATCH, &ignoreInitRegZeroed, - intRegState.rsCalleeRegArgMaskLiveIn); genPushCalleeSavedRegisters(initReg, &initRegZeroed); #else // !TARGET_ARM64 genPushCalleeSavedRegisters(); @@ -7841,7 +7223,7 @@ void CodeGen::genFnProlog() if (maskStackAlloc == RBM_NONE) { - genAllocLclFrame(compiler->compLclFrameSize, initReg, &initRegZeroed, intRegState.rsCalleeRegArgMaskLiveIn); + genAllocLclFrame(compiler->compLclFrameSize, initReg, &initRegZeroed); } #endif // !TARGET_ARM64 @@ -9056,26 +8438,12 @@ void CodeGen::genFuncletProlog(BasicBlock* block) bool isFilter = (block->bbCatchTyp == BBCT_FILTER); - regMaskTP maskArgRegsLiveIn; - if (isFilter) - { - maskArgRegsLiveIn = RBM_R0 | RBM_R1; - } - else if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT)) - { - maskArgRegsLiveIn = RBM_NONE; - } - else - { - maskArgRegsLiveIn = RBM_R0; - } - regNumber initReg = REG_R3; // R3 is never live on entry to a funclet, so it can be trashed bool initRegZeroed = false; if (maskStackAlloc == RBM_NONE) { - genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed, maskArgRegsLiveIn); + genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed); } // This is the end of the OS-reported prolog for purposes of unwinding @@ -9362,20 +8730,10 @@ void CodeGen::genFuncletProlog(BasicBlock* block) // Callee saved int registers are pushed to stack. 
genPushCalleeSavedRegisters(); - regMaskTP maskArgRegsLiveIn; - if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT)) - { - maskArgRegsLiveIn = RBM_ARG_0; - } - else - { - maskArgRegsLiveIn = RBM_ARG_0 | RBM_ARG_2; - } - regNumber initReg = REG_EBP; // We already saved EBP, so it can be trashed bool initRegZeroed = false; - genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed, maskArgRegsLiveIn); + genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed); // Callee saved float registers are copied to stack in their assigned stack slots // after allocating space for them as part of funclet frame. diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5611a018ac89b8..ae291d2cacbe2c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1988,12 +1988,11 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode) // initReg - register to use as a scratch register. // pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if // this call sets 'initReg' to a non-zero value. -// maskArgRegsLiveIn - incoming argument registers that are currently live. // // Return value: // None // -void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed) { assert(compiler->compGeneratingProlog); @@ -8962,4 +8961,63 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) #endif // PROFILING_SUPPORTED +/*----------------------------------------------------------------------------- + * + * Push any callee-saved registers we have used + */ + +void CodeGen::genPushCalleeSavedRegisters() +{ + assert(compiler->compGeneratingProlog); + + // x86/x64 doesn't support push of xmm/ymm regs, therefore consider only integer registers for pushing onto stack + // here. 
Space for float registers to be preserved is stack allocated and saved as part of prolog sequence and not + // here. + regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_INT_CALLEE_SAVED; + +#if ETW_EBP_FRAMED + if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) + { + noway_assert(!"Used register RBM_FPBASE as a scratch register!"); + } +#endif + + // On X86/X64 we have already pushed the FP (frame-pointer) prior to calling this method + if (isFramePointerUsed()) + { + rsPushRegs &= ~RBM_FPBASE; + } + +#ifdef DEBUG + if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) + { + printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", + compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); + dspRegMask(rsPushRegs); + printf("\n"); + assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); + } +#endif // DEBUG + + // Push backwards so we match the order we will pop them in the epilog + // and all the other code that expects it to be in this order. + for (regNumber reg = REG_INT_LAST; rsPushRegs != RBM_NONE; reg = REG_PREV(reg)) + { + regMaskTP regBit = genRegMask(reg); + + if ((regBit & rsPushRegs) != 0) + { + inst_RV(INS_push, reg, TYP_REF); + compiler->unwindPush(reg); +#ifdef USING_SCOPE_INFO + if (!doubleAlignOrFramePointerUsed()) + { + psiAdjustStackLevel(REGSIZE_BYTES); + } +#endif // USING_SCOPE_INFO + rsPushRegs &= ~regBit; + } + } +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3feabf558b94ce..fc860bec02bb91 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7479,19 +7479,6 @@ class Compiler return (target_size_t)eeGetEEInfo()->osPageSize; } - // Returns the frame size at which we will generate a loop to probe the stack. 
- target_size_t getVeryLargeFrameSize() - { -#ifdef TARGET_ARM - // The looping probe code is 40 bytes, whereas the straight-line probing for - // the (0x2000..0x3000) case is 44, so use looping for anything 0x2000 bytes - // or greater, to generate smaller code. - return 2 * eeGetPageSize(); -#else - return 3 * eeGetPageSize(); -#endif - } - //------------------------------------------------------------------------ // VirtualStubParam: virtual stub dispatch extra parameter (slot address). // diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index a9abde124fc37b..95405bed1255aa 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5997,6 +5997,14 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() { codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames } + + const unsigned probePageSize = (unsigned)eeGetPageSize(); + + if (compLclFrameSize >= STACK_PROBE_HELPER_FRAME_SIZE_PAGES * probePageSize) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); + } + #endif // TARGET_ARM64 #ifdef TARGET_XARCH diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index d4d501e5fd72d3..3d45ef834a23fa 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -1552,6 +1552,13 @@ typedef unsigned char regNumberSmall; // For arm64, this is the maximum prolog establishment pre-indexed (that is SP pre-decrement) offset. #define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 512 + #define STACK_PROBE_HELPER_FRAME_SIZE_PAGES 4 + #define REG_STACK_PROBE_HELPER_ARG REG_R9 + #define RBM_STACK_PROBE_HELPER_ARG RBM_R9 + #define REG_STACK_PROBE_HELPER_CALL_TARGET REG_IP0 + #define RBM_STACK_PROBE_HELPER_CALL_TARGET RBM_IP0 + #define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE + // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. 
"MLA (by element)") // have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes). #define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15) diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index dcdfda4df350d6..930395b56dc7e7 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -1106,7 +1106,7 @@ DelayLoad_Helper\suffix: // The following helper will access ("probe") a word on each page of the stack // starting with the page right beneath sp down to the one pointed to by r4. // The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. -// The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. +// The call to the helper will be emitted by JIT in the function/funclet prolog when stack frame is larger than an OS page. // On entry: // r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) // sp - points to some byte on the last probed page @@ -1115,23 +1115,23 @@ DelayLoad_Helper\suffix: // r5 - is not preserved // // NOTE: this helper will probe at least one page below the one pointed to by sp. 
-#define PAGE_SIZE 0x1000 -#define PAGE_SIZE_LOG2 12 +#define PROBE_PAGE_SIZE 4096 +#define PROBE_PAGE_SIZE_LOG2 12 LEAF_ENTRY JIT_StackProbe, _TEXT PROLOG_PUSH "{r7}" PROLOG_STACK_SAVE r7 - mov r5, sp // r5 points to some byte on the last probed page - bfc r5, #0, #PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page + mov r5, sp // r5 points to some byte on the last probed page + bfc r5, #0, #PROBE_PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page mov sp, r5 ProbeLoop: - // Immediate operand for the following instruction can not be greater than 4095. - sub sp, #(PAGE_SIZE - 4) // sp points to the **fourth** byte on the **next page** to probe - ldr r5, [sp, #-4]! // sp points to the lowest address on the **last probed** page + // Immediate operand for the following instruction can not be greater than 4095. + sub sp, #(PROBE_PAGE_SIZE - 4) // sp points to the **fourth** byte on the **next page** to probe + ldr r5, [sp, #-4]! // sp points to the lowest address on the **last probed** page cmp sp, r4 - bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page. + bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page. EPILOG_STACK_RESTORE r7 EPILOG_POP "{r7}" diff --git a/src/coreclr/vm/arm/asmhelpers.asm b/src/coreclr/vm/arm/asmhelpers.asm index 1565c13d3d4563..d20540e62090e0 100644 --- a/src/coreclr/vm/arm/asmhelpers.asm +++ b/src/coreclr/vm/arm/asmhelpers.asm @@ -1835,7 +1835,7 @@ $__RealName ;; The following helper will access ("probe") a word on each page of the stack ;; starting with the page right beneath sp down to the one pointed to by r4. ;; The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. -;; The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. 
+;; The call to the helper will be emitted by JIT in the function/funclet prolog when stack frame is larger than an OS page. ;;----------------------------------------------------------------------------- ; On entry: ; r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) @@ -1845,21 +1845,23 @@ $__RealName ; r5 - is not preserved ; ; NOTE: this helper will probe at least one page below the one pointed to by sp. -#define PAGE_SIZE_LOG2 12 +#define PROBE_PAGE_SIZE 4096 +#define PROBE_PAGE_SIZE_LOG2 12 + LEAF_ENTRY JIT_StackProbe PROLOG_PUSH {r7} PROLOG_STACK_SAVE r7 - mov r5, sp ; r5 points to some byte on the last probed page - bfc r5, #0, #PAGE_SIZE_LOG2 ; r5 points to the **lowest address** on the last probed page + mov r5, sp ; r5 points to some byte on the last probed page + bfc r5, #0, #PROBE_PAGE_SIZE_LOG2 ; r5 points to the **lowest address** on the last probed page mov sp, r5 ProbeLoop - ; Immediate operand for the following instruction can not be greater than 4095. - sub sp, #(PAGE_SIZE - 4) ; sp points to the **fourth** byte on the **next page** to probe - ldr r5, [sp, #-4]! ; sp points to the lowest address on the **last probed** page + ; Immediate operand for the following instruction can not be greater than 4095. + sub sp, #(PROBE_PAGE_SIZE - 4) ; sp points to the **fourth** byte on the **next page** to probe + ldr r5, [sp, #-4]! ; sp points to the lowest address on the **last probed** page cmp sp, r4 - bhi ProbeLoop ; if (sp > r4), then we need to probe at least one more page. + bhi ProbeLoop ; if (sp > r4), then we need to probe at least one more page. 
EPILOG_STACK_RESTORE r7 EPILOG_POP {r7} diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index d9c7fcd1eb3428..7d38662732081a 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -1258,6 +1258,26 @@ GenerateProfileHelper ProfileTailcall, PROFILE_TAILCALL #endif +#define PROBE_PAGE_SIZE_LOG2 12 +#define PROBE_PAGE_SIZE 4096 + +LEAF_ENTRY JIT_StackProbe, _TEXT + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -16 + + add x30, x9, #(PROBE_PAGE_SIZE >> 12), lsl #12 // x30 points to some byte on the page **immediately preceding** the last page to probe (i.e. pointed to by x9) + bfc x30, #0, #(PROBE_PAGE_SIZE_LOG2) // x30 points to the **lowest address** on that page + +LOCAL_LABEL(ProbeLoop): + sub sp, sp, #(PROBE_PAGE_SIZE >> 12), lsl #12 // sp points to some byte on the **next page** to probe + ldr wzr, [sp] // sp points to some byte on the **last probed** page + cmp sp, x30, lsl #0 + bhs LOCAL_LABEL(ProbeLoop) // if (sp >= x30), then we need to probe at least one more page + + mov sp, fp + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 16 + ret lr +LEAF_END_MARKED JIT_StackProbe + #ifdef FEATURE_TIERED_COMPILATION NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT, NoHandler diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index 2f9227b1d80df6..4edb7d22111f9f 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -1478,6 +1478,40 @@ __HelperNakedFuncName SETS "$helper":CC:"Naked" #endif +;;----------------------------------------------------------------------------- +;; The following helper will access ("probe") a word on each page of the stack +;; starting with the page right beneath sp down to the one pointed to by x9. +;; The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. 
+;; The call to the helper will be emitted by JIT in the function/funclet prolog when large stack frame is required. +;;----------------------------------------------------------------------------- +; On entry: +; x9 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) +; sp - points to some byte on the last probed page +; On exit: +; x9 - is preserved +; +; NOTE: this helper will probe at least one page below the one pointed to by sp. +#define PROBE_PAGE_SIZE 4096 +#define PROBE_PAGE_SIZE_LOG2 12 + + LEAF_ENTRY JIT_StackProbe + PROLOG_SAVE_REG_PAIR fp, lr, #-16! + + add x30, x9, #(PROBE_PAGE_SIZE >> 12), lsl #12 ; x30 points to some byte on the page **immediately preceding** the last page to probe (i.e. pointed to by x9) + bfc x30, #0, #(PROBE_PAGE_SIZE_LOG2) ; x30 points to the **lowest address** on that page + +ProbeLoop + sub sp, sp, #(PROBE_PAGE_SIZE >> 12), lsl #12 ; sp points to some byte on the **next page** to probe + ldr wzr, [sp] ; sp points to some byte on the **last probed** page + cmp sp, x30, lsl #0 + bhs ProbeLoop ; if (sp >= x30), then we need to probe at least one more page + + mov sp, fp + EPILOG_RESTORE_REG_PAIR fp, lr, 16! 
+ + ret lr + LEAF_END_MARKED JIT_StackProbe + #ifdef FEATURE_TIERED_COMPILATION IMPORT OnCallCountThresholdReached diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 34afa5b2458c98..8614710e4987fd 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -6587,9 +6587,7 @@ IsDebuggerFault(EXCEPTION_RECORD *pExceptionRecord, #endif // TARGET_UNIX -#ifndef TARGET_ARM64 EXTERN_C void JIT_StackProbe_End(); -#endif // TARGET_ARM64 #ifdef FEATURE_EH_FUNCLETS @@ -6654,9 +6652,7 @@ bool IsIPInMarkedJitHelper(UINT_PTR uControlPc) CHECK_RANGE(JIT_WriteBarrier) CHECK_RANGE(JIT_CheckedWriteBarrier) CHECK_RANGE(JIT_ByRefWriteBarrier) -#if !defined(TARGET_ARM64) CHECK_RANGE(JIT_StackProbe) -#endif // !TARGET_ARM64 #else #ifdef TARGET_UNIX CHECK_RANGE(JIT_WriteBarrierGroup) diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index e0e4f81e42ad21..22c868511045b3 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -10227,7 +10227,16 @@ void CEEInfo::getEEInfo(CORINFO_EE_INFO *pEEInfoOut) _ASSERTE(sizeof(ReversePInvokeFrame) <= pEEInfoOut->sizeOfReversePInvokeFrame); #endif - pEEInfoOut->osPageSize = GetOsPageSize(); + if (!IsReadyToRunCompilation()) + { + pEEInfoOut->osPageSize = GetOsPageSize(); + } + else + { + // In AOT scenarios the VM reports to the JIT the minimum page size. 
+ pEEInfoOut->osPageSize = 0x1000; + } + pEEInfoOut->maxUncheckedOffsetForNullObject = MAX_UNCHECKED_OFFSET_FOR_NULL_OBJECT; pEEInfoOut->targetAbi = CORINFO_CORECLR_ABI; diff --git a/src/coreclr/vm/jitinterface.h b/src/coreclr/vm/jitinterface.h index 07a035a278611a..7d8ef5ff391db7 100644 --- a/src/coreclr/vm/jitinterface.h +++ b/src/coreclr/vm/jitinterface.h @@ -399,9 +399,7 @@ extern "C" void STDCALL JIT_MemCpy(void *dest, const void *src, SIZE_T count); void STDMETHODCALLTYPE JIT_ProfilerEnterLeaveTailcallStub(UINT_PTR ProfilerHandle); -#ifndef TARGET_ARM64 void STDCALL JIT_StackProbe(); -#endif // TARGET_ARM64 };