Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
35269 views
//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file contains the X86 implementation of TargetFrameLowering class.9//10//===----------------------------------------------------------------------===//1112#include "X86FrameLowering.h"13#include "MCTargetDesc/X86MCTargetDesc.h"14#include "X86InstrBuilder.h"15#include "X86InstrInfo.h"16#include "X86MachineFunctionInfo.h"17#include "X86Subtarget.h"18#include "X86TargetMachine.h"19#include "llvm/ADT/Statistic.h"20#include "llvm/CodeGen/LivePhysRegs.h"21#include "llvm/CodeGen/MachineFrameInfo.h"22#include "llvm/CodeGen/MachineFunction.h"23#include "llvm/CodeGen/MachineInstrBuilder.h"24#include "llvm/CodeGen/MachineModuleInfo.h"25#include "llvm/CodeGen/MachineRegisterInfo.h"26#include "llvm/CodeGen/WinEHFuncInfo.h"27#include "llvm/IR/DataLayout.h"28#include "llvm/IR/EHPersonalities.h"29#include "llvm/IR/Function.h"30#include "llvm/IR/Module.h"31#include "llvm/MC/MCAsmInfo.h"32#include "llvm/MC/MCObjectFileInfo.h"33#include "llvm/MC/MCSymbol.h"34#include "llvm/Support/Debug.h"35#include "llvm/Support/LEB128.h"36#include "llvm/Target/TargetOptions.h"37#include <cstdlib>3839#define DEBUG_TYPE "x86-fl"4041STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");42STATISTIC(NumFrameExtraProbe,43"Number of extra stack probes generated in prologue");44STATISTIC(NumFunctionUsingPush2Pop2, "Number of funtions using push2/pop2");4546using namespace llvm;4748X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,49MaybeAlign StackAlignOverride)50: TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),51STI.is64Bit() ? -8 : -4),52STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {53// Cache a bunch of frame-related predicates for this subtarget.54SlotSize = TRI->getSlotSize();55Is64Bit = STI.is64Bit();56IsLP64 = STI.isTarget64BitLP64();57// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.58Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();59StackPtr = TRI->getStackRegister();60}6162bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {63return !MF.getFrameInfo().hasVarSizedObjects() &&64!MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&65!MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();66}6768/// canSimplifyCallFramePseudos - If there is a reserved call frame, the69/// call frame pseudos can be simplified. Having a FP, as in the default70/// implementation, is not sufficient here since we can't always use it.71/// Use a more nuanced condition.72bool X86FrameLowering::canSimplifyCallFramePseudos(73const MachineFunction &MF) const {74return hasReservedCallFrame(MF) ||75MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||76(hasFP(MF) && !TRI->hasStackRealignment(MF)) ||77TRI->hasBasePointer(MF);78}7980// needsFrameIndexResolution - Do we need to perform FI resolution for81// this function. Normally, this is required only when the function82// has any stack objects. 
However, FI resolution actually has another job,83// not apparent from the title - it resolves callframesetup/destroy84// that were not simplified earlier.85// So, this is required for x86 functions that have push sequences even86// when there are no stack objects.87bool X86FrameLowering::needsFrameIndexResolution(88const MachineFunction &MF) const {89return MF.getFrameInfo().hasStackObjects() ||90MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();91}9293/// hasFP - Return true if the specified function should have a dedicated frame94/// pointer register. This is true if the function has variable sized allocas95/// or if frame pointer elimination is disabled.96bool X86FrameLowering::hasFP(const MachineFunction &MF) const {97const MachineFrameInfo &MFI = MF.getFrameInfo();98return (MF.getTarget().Options.DisableFramePointerElim(MF) ||99TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||100MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||101MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||102MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||103MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||104MFI.hasStackMap() || MFI.hasPatchPoint() ||105(isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));106}107108static unsigned getSUBriOpcode(bool IsLP64) {109return IsLP64 ? X86::SUB64ri32 : X86::SUB32ri;110}111112static unsigned getADDriOpcode(bool IsLP64) {113return IsLP64 ? X86::ADD64ri32 : X86::ADD32ri;114}115116static unsigned getSUBrrOpcode(bool IsLP64) {117return IsLP64 ? X86::SUB64rr : X86::SUB32rr;118}119120static unsigned getADDrrOpcode(bool IsLP64) {121return IsLP64 ? X86::ADD64rr : X86::ADD32rr;122}123124static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {125return IsLP64 ? X86::AND64ri32 : X86::AND32ri;126}127128static unsigned getLEArOpcode(bool IsLP64) {129return IsLP64 ? X86::LEA64r : X86::LEA32r;130}131132static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {133if (Use64BitReg) {134if (isUInt<32>(Imm))135return X86::MOV32ri64;136if (isInt<32>(Imm))137return X86::MOV64ri32;138return X86::MOV64ri;139}140return X86::MOV32ri;141}142143// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the144// value written by the PUSH from the stack. The processor tracks these marked145// instructions internally and fast-forwards register data between matching PUSH146// and POP instructions, without going through memory or through the training147// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient148// memory-renaming optimization can be used.149//150// The PPX hint is purely a performance hint. Instructions with this hint have151// the same functional semantics as those without. PPX hints set by the152// compiler that violate the balancing rule may turn off the PPX optimization,153// but they will not affect program semantics.154//155// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp156// are not considered).157//158// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2159// GPRs at a time to/from the stack.160static unsigned getPUSHOpcode(const X86Subtarget &ST) {161return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)162: X86::PUSH32r;163}164static unsigned getPOPOpcode(const X86Subtarget &ST) {165return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)166: X86::POP32r;167}168static unsigned getPUSH2Opcode(const X86Subtarget &ST) {169return ST.hasPPX() ? 
X86::PUSH2P : X86::PUSH2;170}171static unsigned getPOP2Opcode(const X86Subtarget &ST) {172return ST.hasPPX() ? X86::POP2P : X86::POP2;173}174175static bool isEAXLiveIn(MachineBasicBlock &MBB) {176for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {177unsigned Reg = RegMask.PhysReg;178179if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||180Reg == X86::AH || Reg == X86::AL)181return true;182}183184return false;185}186187/// Check if the flags need to be preserved before the terminators.188/// This would be the case, if the eflags is live-in of the region189/// composed by the terminators or live-out of that region, without190/// being defined by a terminator.191static bool192flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {193for (const MachineInstr &MI : MBB.terminators()) {194bool BreakNext = false;195for (const MachineOperand &MO : MI.operands()) {196if (!MO.isReg())197continue;198Register Reg = MO.getReg();199if (Reg != X86::EFLAGS)200continue;201202// This terminator needs an eflags that is not defined203// by a previous another terminator:204// EFLAGS is live-in of the region composed by the terminators.205if (!MO.isDef())206return true;207// This terminator defines the eflags, i.e., we don't need to preserve it.208// However, we still need to check this specific terminator does not209// read a live-in value.210BreakNext = true;211}212// We found a definition of the eflags, no need to preserve them.213if (BreakNext)214return false;215}216217// None of the terminators use or define the eflags.218// Check if they are live-out, that would imply we need to preserve them.219for (const MachineBasicBlock *Succ : MBB.successors())220if (Succ->isLiveIn(X86::EFLAGS))221return true;222223return false;224}225226/// emitSPUpdate - Emit a series of instructions to increment / decrement the227/// stack pointer by a constant value.228void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,229MachineBasicBlock::iterator &MBBI,230const DebugLoc &DL, int64_t NumBytes,231bool InEpilogue) const {232bool isSub = NumBytes < 0;233uint64_t Offset = isSub ? -NumBytes : NumBytes;234MachineInstr::MIFlag Flag =235isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;236237uint64_t Chunk = (1LL << 31) - 1;238239MachineFunction &MF = *MBB.getParent();240const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();241const X86TargetLowering &TLI = *STI.getTargetLowering();242const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);243244// It's ok to not take into account large chunks when probing, as the245// allocation is split in smaller chunks anyway.246if (EmitInlineStackProbe && !InEpilogue) {247248// This pseudo-instruction is going to be expanded, potentially using a249// loop, by inlineStackProbe().250BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);251return;252} else if (Offset > Chunk) {253// Rather than emit a long series of instructions for large offsets,254// load the offset into a register and do one sub/add255unsigned Reg = 0;256unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);257258if (isSub && !isEAXLiveIn(MBB))259Reg = Rax;260else261Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);262263unsigned AddSubRROpc =264isSub ? 
getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);265if (Reg) {266BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)267.addImm(Offset)268.setMIFlag(Flag);269MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)270.addReg(StackPtr)271.addReg(Reg);272MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.273return;274} else if (Offset > 8 * Chunk) {275// If we would need more than 8 add or sub instructions (a >16GB stack276// frame), it's worth spilling RAX to materialize this immediate.277// pushq %rax278// movabsq +-$Offset+-SlotSize, %rax279// addq %rsp, %rax280// xchg %rax, (%rsp)281// movq (%rsp), %rsp282assert(Is64Bit && "can't have 32-bit 16GB stack frame");283BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))284.addReg(Rax, RegState::Kill)285.setMIFlag(Flag);286// Subtract is not commutative, so negate the offset and always use add.287// Subtract 8 less and add 8 more to account for the PUSH we just did.288if (isSub)289Offset = -(Offset - SlotSize);290else291Offset = Offset + SlotSize;292BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)293.addImm(Offset)294.setMIFlag(Flag);295MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)296.addReg(Rax)297.addReg(StackPtr);298MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.299// Exchange the new SP in RAX with the top of the stack.300addRegOffset(301BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),302StackPtr, false, 0);303// Load new SP from the top of the stack into RSP.304addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),305StackPtr, false, 0);306return;307}308}309310while (Offset) {311uint64_t ThisVal = std::min(Offset, Chunk);312if (ThisVal == SlotSize) {313// Use push / pop for slot sized adjustments as a size optimization. We314// need to find a dead register when using pop.315unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)316: TRI->findDeadCallerSavedReg(MBB, MBBI);317if (Reg) {318unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)319: (Is64Bit ? X86::POP64r : X86::POP32r);320BuildMI(MBB, MBBI, DL, TII.get(Opc))321.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))322.setMIFlag(Flag);323Offset -= ThisVal;324continue;325}326}327328BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)329.setMIFlag(Flag);330331Offset -= ThisVal;332}333}334335MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(336MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,337const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {338assert(Offset != 0 && "zero offset stack adjustment requested");339340// On Atom, using LEA to adjust SP is preferred, but using it in the epilogue341// is tricky.342bool UseLEA;343if (!InEpilogue) {344// Check if inserting the prologue at the beginning345// of MBB would require to use LEA operations.346// We need to use LEA operations if EFLAGS is live in, because347// it means an instruction will read it before it gets defined.348UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);349} else {350// If we can use LEA for SP but we shouldn't, check that none351// of the terminators uses the eflags. 
Otherwise we will insert352// a ADD that will redefine the eflags and break the condition.353// Alternatively, we could move the ADD, but this may not be possible354// and is an optimization anyway.355UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());356if (UseLEA && !STI.useLeaForSP())357UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);358// If that assert breaks, that means we do not do the right thing359// in canUseAsEpilogue.360assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&361"We shouldn't have allowed this insertion point");362}363364MachineInstrBuilder MI;365if (UseLEA) {366MI = addRegOffset(BuildMI(MBB, MBBI, DL,367TII.get(getLEArOpcode(Uses64BitFramePtr)),368StackPtr),369StackPtr, false, Offset);370} else {371bool IsSub = Offset < 0;372uint64_t AbsOffset = IsSub ? -Offset : Offset;373const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr)374: getADDriOpcode(Uses64BitFramePtr);375MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)376.addReg(StackPtr)377.addImm(AbsOffset);378MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.379}380return MI;381}382383int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,384MachineBasicBlock::iterator &MBBI,385bool doMergeWithPrevious) const {386if ((doMergeWithPrevious && MBBI == MBB.begin()) ||387(!doMergeWithPrevious && MBBI == MBB.end()))388return 0;389390MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;391392PI = skipDebugInstructionsBackward(PI, MBB.begin());393// It is assumed that ADD/SUB/LEA instruction is succeded by one CFI394// instruction, and that there are no DBG_VALUE or other instructions between395// ADD/SUB/LEA and its corresponding CFI instruction.396/* TODO: Add support for the case where there are multiple CFI instructions397below the ADD/SUB/LEA, e.g.:398...399add400cfi_def_cfa_offset401cfi_offset402...403*/404if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())405PI = std::prev(PI);406407unsigned Opc = PI->getOpcode();408int Offset = 0;409410if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&411PI->getOperand(0).getReg() == StackPtr) {412assert(PI->getOperand(1).getReg() == StackPtr);413Offset = PI->getOperand(2).getImm();414} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&415PI->getOperand(0).getReg() == StackPtr &&416PI->getOperand(1).getReg() == StackPtr &&417PI->getOperand(2).getImm() == 1 &&418PI->getOperand(3).getReg() == X86::NoRegister &&419PI->getOperand(5).getReg() == X86::NoRegister) {420// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.421Offset = PI->getOperand(4).getImm();422} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&423PI->getOperand(0).getReg() == StackPtr) {424assert(PI->getOperand(1).getReg() == StackPtr);425Offset = -PI->getOperand(2).getImm();426} else427return 0;428429PI = MBB.erase(PI);430if (PI != MBB.end() && PI->isCFIInstruction()) {431auto CIs = MBB.getParent()->getFrameInstructions();432MCCFIInstruction CI = CIs[PI->getOperand(0).getCFIIndex()];433if (CI.getOperation() == MCCFIInstruction::OpDefCfaOffset ||434CI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)435PI = MBB.erase(PI);436}437if (!doMergeWithPrevious)438MBBI = skipDebugInstructionsForward(PI, MBB.end());439440return Offset;441}442443void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,444MachineBasicBlock::iterator MBBI,445const DebugLoc &DL,446const MCCFIInstruction &CFIInst,447MachineInstr::MIFlag Flag) const {448MachineFunction &MF = *MBB.getParent();449unsigned CFIIndex = 
MF.addFrameInst(CFIInst);450451if (CFIInst.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)452MF.getInfo<X86MachineFunctionInfo>()->setHasCFIAdjustCfa(true);453454BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))455.addCFIIndex(CFIIndex)456.setMIFlag(Flag);457}458459/// Emits Dwarf Info specifying offsets of callee saved registers and460/// frame pointer. This is called only when basic block sections are enabled.461void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(462MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {463MachineFunction &MF = *MBB.getParent();464if (!hasFP(MF)) {465emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);466return;467}468const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();469const Register FramePtr = TRI->getFrameRegister(MF);470const Register MachineFramePtr =471STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))472: FramePtr;473unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);474// Offset = space for return address + size of the frame pointer itself.475int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);476BuildCFI(MBB, MBBI, DebugLoc{},477MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));478emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);479}480481void X86FrameLowering::emitCalleeSavedFrameMoves(482MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,483const DebugLoc &DL, bool IsPrologue) const {484MachineFunction &MF = *MBB.getParent();485MachineFrameInfo &MFI = MF.getFrameInfo();486const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();487X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();488489// Add callee saved registers to move list.490const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();491492// Calculate offsets.493for (const CalleeSavedInfo &I : CSI) {494int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());495Register Reg = I.getReg();496unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);497498if (IsPrologue) {499if (X86FI->getStackPtrSaveMI()) {500// +2*SlotSize because there is return address and ebp at the bottom501// of the stack.502// | retaddr |503// | ebp |504// | |<--ebp505Offset += 2 * SlotSize;506SmallString<64> CfaExpr;507CfaExpr.push_back(dwarf::DW_CFA_expression);508uint8_t buffer[16];509CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));510CfaExpr.push_back(2);511Register FramePtr = TRI->getFrameRegister(MF);512const Register MachineFramePtr =513STI.isTarget64BitILP32()514? Register(getX86SubSuperRegister(FramePtr, 64))515: FramePtr;516unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);517CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));518CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));519BuildCFI(MBB, MBBI, DL,520MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),521MachineInstr::FrameSetup);522} else {523BuildCFI(MBB, MBBI, DL,524MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));525}526} else {527BuildCFI(MBB, MBBI, DL,528MCCFIInstruction::createRestore(nullptr, DwarfReg));529}530}531if (auto *MI = X86FI->getStackPtrSaveMI()) {532int FI = MI->getOperand(1).getIndex();533int64_t Offset = MFI.getObjectOffset(FI) + 2 * SlotSize;534SmallString<64> CfaExpr;535Register FramePtr = TRI->getFrameRegister(MF);536const Register MachineFramePtr =537STI.isTarget64BitILP32()538? 
Register(getX86SubSuperRegister(FramePtr, 64))539: FramePtr;540unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);541CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));542uint8_t buffer[16];543CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));544CfaExpr.push_back(dwarf::DW_OP_deref);545546SmallString<64> DefCfaExpr;547DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);548DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));549DefCfaExpr.append(CfaExpr.str());550// DW_CFA_def_cfa_expression: DW_OP_breg5 offset, DW_OP_deref551BuildCFI(MBB, MBBI, DL,552MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),553MachineInstr::FrameSetup);554}555}556557void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,558MachineBasicBlock &MBB) const {559const MachineFunction &MF = *MBB.getParent();560561// Insertion point.562MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();563564// Fake a debug loc.565DebugLoc DL;566if (MBBI != MBB.end())567DL = MBBI->getDebugLoc();568569// Zero out FP stack if referenced. Do this outside of the loop below so that570// it's done only once.571const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();572for (MCRegister Reg : RegsToZero.set_bits()) {573if (!X86::RFP80RegClass.contains(Reg))574continue;575576unsigned NumFPRegs = ST.is64Bit() ? 8 : 7;577for (unsigned i = 0; i != NumFPRegs; ++i)578BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0));579580for (unsigned i = 0; i != NumFPRegs; ++i)581BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0);582break;583}584585// For GPRs, we only care to clear out the 32-bit register.586BitVector GPRsToZero(TRI->getNumRegs());587for (MCRegister Reg : RegsToZero.set_bits())588if (TRI->isGeneralPurposeRegister(MF, Reg)) {589GPRsToZero.set(getX86SubSuperRegister(Reg, 32));590RegsToZero.reset(Reg);591}592593// Zero out the GPRs first.594for (MCRegister Reg : GPRsToZero.set_bits())595TII.buildClearRegister(Reg, MBB, MBBI, DL);596597// Zero out the remaining registers.598for (MCRegister Reg : RegsToZero.set_bits())599TII.buildClearRegister(Reg, MBB, MBBI, DL);600}601602void X86FrameLowering::emitStackProbe(603MachineFunction &MF, MachineBasicBlock &MBB,604MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,605std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {606const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();607if (STI.isTargetWindowsCoreCLR()) {608if (InProlog) {609BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))610.addImm(0 /* no explicit stack size */);611} else {612emitStackProbeInline(MF, MBB, MBBI, DL, false);613}614} else {615emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum);616}617}618619bool X86FrameLowering::stackProbeFunctionModifiesSP() const {620return STI.isOSWindows() && !STI.isTargetWin64();621}622623void X86FrameLowering::inlineStackProbe(MachineFunction &MF,624MachineBasicBlock &PrologMBB) const {625auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {626return MI.getOpcode() == X86::STACKALLOC_W_PROBING;627});628if (Where != PrologMBB.end()) {629DebugLoc DL = PrologMBB.findDebugLoc(Where);630emitStackProbeInline(MF, PrologMBB, Where, DL, true);631Where->eraseFromParent();632}633}634635void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,636MachineBasicBlock &MBB,637MachineBasicBlock::iterator MBBI,638const DebugLoc &DL,639bool InProlog) const {640const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();641if (STI.isTargetWindowsCoreCLR() && 
STI.is64Bit())642emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);643else644emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);645}646647void X86FrameLowering::emitStackProbeInlineGeneric(648MachineFunction &MF, MachineBasicBlock &MBB,649MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {650MachineInstr &AllocWithProbe = *MBBI;651uint64_t Offset = AllocWithProbe.getOperand(0).getImm();652653const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();654const X86TargetLowering &TLI = *STI.getTargetLowering();655assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&656"different expansion expected for CoreCLR 64 bit");657658const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);659uint64_t ProbeChunk = StackProbeSize * 8;660661uint64_t MaxAlign =662TRI->hasStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;663664// Synthesize a loop or unroll it, depending on the number of iterations.665// BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left666// between the unaligned rsp and current rsp.667if (Offset > ProbeChunk) {668emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,669MaxAlign % StackProbeSize);670} else {671emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,672MaxAlign % StackProbeSize);673}674}675676void X86FrameLowering::emitStackProbeInlineGenericBlock(677MachineFunction &MF, MachineBasicBlock &MBB,678MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,679uint64_t AlignOffset) const {680681const bool NeedsDwarfCFI = needsDwarfCFI(MF);682const bool HasFP = hasFP(MF);683const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();684const X86TargetLowering &TLI = *STI.getTargetLowering();685const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;686const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);687688uint64_t CurrentOffset = 0;689690assert(AlignOffset < StackProbeSize);691692// If the offset is so small it fits within a page, there's nothing to do.693if (StackProbeSize < Offset + AlignOffset) {694695uint64_t StackAdjustment = StackProbeSize - AlignOffset;696BuildStackAdjustment(MBB, MBBI, DL, -StackAdjustment, /*InEpilogue=*/false)697.setMIFlag(MachineInstr::FrameSetup);698if (!HasFP && NeedsDwarfCFI) {699BuildCFI(700MBB, MBBI, DL,701MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));702}703704addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))705.setMIFlag(MachineInstr::FrameSetup),706StackPtr, false, 0)707.addImm(0)708.setMIFlag(MachineInstr::FrameSetup);709NumFrameExtraProbe++;710CurrentOffset = StackProbeSize - AlignOffset;711}712713// For the next N - 1 pages, just probe. 
I tried to take advantage of714// natural probes but it implies much more logic and there was very few715// interesting natural probes to interleave.716while (CurrentOffset + StackProbeSize < Offset) {717BuildStackAdjustment(MBB, MBBI, DL, -StackProbeSize, /*InEpilogue=*/false)718.setMIFlag(MachineInstr::FrameSetup);719720if (!HasFP && NeedsDwarfCFI) {721BuildCFI(722MBB, MBBI, DL,723MCCFIInstruction::createAdjustCfaOffset(nullptr, StackProbeSize));724}725addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))726.setMIFlag(MachineInstr::FrameSetup),727StackPtr, false, 0)728.addImm(0)729.setMIFlag(MachineInstr::FrameSetup);730NumFrameExtraProbe++;731CurrentOffset += StackProbeSize;732}733734// No need to probe the tail, it is smaller than a Page.735uint64_t ChunkSize = Offset - CurrentOffset;736if (ChunkSize == SlotSize) {737// Use push for slot sized adjustments as a size optimization,738// like emitSPUpdate does when not probing.739unsigned Reg = Is64Bit ? X86::RAX : X86::EAX;740unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;741BuildMI(MBB, MBBI, DL, TII.get(Opc))742.addReg(Reg, RegState::Undef)743.setMIFlag(MachineInstr::FrameSetup);744} else {745BuildStackAdjustment(MBB, MBBI, DL, -ChunkSize, /*InEpilogue=*/false)746.setMIFlag(MachineInstr::FrameSetup);747}748// No need to adjust Dwarf CFA offset here, the last position of the stack has749// been defined750}751752void X86FrameLowering::emitStackProbeInlineGenericLoop(753MachineFunction &MF, MachineBasicBlock &MBB,754MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,755uint64_t AlignOffset) const {756assert(Offset && "null offset");757758assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=759MachineBasicBlock::LQR_Live &&760"Inline stack probe loop will clobber live EFLAGS.");761762const bool NeedsDwarfCFI = needsDwarfCFI(MF);763const bool HasFP = hasFP(MF);764const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();765const X86TargetLowering &TLI = *STI.getTargetLowering();766const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;767const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);768769if (AlignOffset) {770if (AlignOffset < StackProbeSize) {771// Perform a first smaller allocation followed by a probe.772BuildStackAdjustment(MBB, MBBI, DL, -AlignOffset, /*InEpilogue=*/false)773.setMIFlag(MachineInstr::FrameSetup);774775addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))776.setMIFlag(MachineInstr::FrameSetup),777StackPtr, false, 0)778.addImm(0)779.setMIFlag(MachineInstr::FrameSetup);780NumFrameExtraProbe++;781Offset -= AlignOffset;782}783}784785// Synthesize a loop786NumFrameLoopProbe++;787const BasicBlock *LLVM_BB = MBB.getBasicBlock();788789MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);790MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);791792MachineFunction::iterator MBBIter = ++MBB.getIterator();793MF.insert(MBBIter, testMBB);794MF.insert(MBBIter, tailMBB);795796Register FinalStackProbed = Uses64BitFramePtr ? X86::R11797: Is64Bit ? 
X86::R11D798: X86::EAX;799800BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)801.addReg(StackPtr)802.setMIFlag(MachineInstr::FrameSetup);803804// save loop bound805{806const unsigned BoundOffset = alignDown(Offset, StackProbeSize);807const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);808BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)809.addReg(FinalStackProbed)810.addImm(BoundOffset)811.setMIFlag(MachineInstr::FrameSetup);812813// while in the loop, use loop-invariant reg for CFI,814// instead of the stack pointer, which changes during the loop815if (!HasFP && NeedsDwarfCFI) {816// x32 uses the same DWARF register numbers as x86-64,817// so there isn't a register number for r11d, we must use r11 instead818const Register DwarfFinalStackProbed =819STI.isTarget64BitILP32()820? Register(getX86SubSuperRegister(FinalStackProbed, 64))821: FinalStackProbed;822823BuildCFI(MBB, MBBI, DL,824MCCFIInstruction::createDefCfaRegister(825nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true)));826BuildCFI(MBB, MBBI, DL,827MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset));828}829}830831// allocate a page832BuildStackAdjustment(*testMBB, testMBB->end(), DL, -StackProbeSize,833/*InEpilogue=*/false)834.setMIFlag(MachineInstr::FrameSetup);835836// touch the page837addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))838.setMIFlag(MachineInstr::FrameSetup),839StackPtr, false, 0)840.addImm(0)841.setMIFlag(MachineInstr::FrameSetup);842843// cmp with stack pointer bound844BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))845.addReg(StackPtr)846.addReg(FinalStackProbed)847.setMIFlag(MachineInstr::FrameSetup);848849// jump850BuildMI(testMBB, DL, TII.get(X86::JCC_1))851.addMBB(testMBB)852.addImm(X86::COND_NE)853.setMIFlag(MachineInstr::FrameSetup);854testMBB->addSuccessor(testMBB);855testMBB->addSuccessor(tailMBB);856857// BB management858tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());859tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);860MBB.addSuccessor(testMBB);861862// handle tail863const uint64_t TailOffset = Offset % StackProbeSize;864MachineBasicBlock::iterator TailMBBIter = tailMBB->begin();865if (TailOffset) {866BuildStackAdjustment(*tailMBB, TailMBBIter, DL, -TailOffset,867/*InEpilogue=*/false)868.setMIFlag(MachineInstr::FrameSetup);869}870871// after the loop, switch back to stack pointer for CFI872if (!HasFP && NeedsDwarfCFI) {873// x32 uses the same DWARF register numbers as x86-64,874// so there isn't a register number for esp, we must use rsp instead875const Register DwarfStackPtr =876STI.isTarget64BitILP32()877? 
Register(getX86SubSuperRegister(StackPtr, 64))878: Register(StackPtr);879880BuildCFI(*tailMBB, TailMBBIter, DL,881MCCFIInstruction::createDefCfaRegister(882nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true)));883}884885// Update Live In information886fullyRecomputeLiveIns({tailMBB, testMBB});887}888889void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(890MachineFunction &MF, MachineBasicBlock &MBB,891MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {892const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();893assert(STI.is64Bit() && "different expansion needed for 32 bit");894assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");895const TargetInstrInfo &TII = *STI.getInstrInfo();896const BasicBlock *LLVM_BB = MBB.getBasicBlock();897898assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=899MachineBasicBlock::LQR_Live &&900"Inline stack probe loop will clobber live EFLAGS.");901902// RAX contains the number of bytes of desired stack adjustment.903// The handling here assumes this value has already been updated so as to904// maintain stack alignment.905//906// We need to exit with RSP modified by this amount and execute suitable907// page touches to notify the OS that we're growing the stack responsibly.908// All stack probing must be done without modifying RSP.909//910// MBB:911// SizeReg = RAX;912// ZeroReg = 0913// CopyReg = RSP914// Flags, TestReg = CopyReg - SizeReg915// FinalReg = !Flags.Ovf ? TestReg : ZeroReg916// LimitReg = gs magic thread env access917// if FinalReg >= LimitReg goto ContinueMBB918// RoundBB:919// RoundReg = page address of FinalReg920// LoopMBB:921// LoopReg = PHI(LimitReg,ProbeReg)922// ProbeReg = LoopReg - PageSize923// [ProbeReg] = 0924// if (ProbeReg > RoundReg) goto LoopMBB925// ContinueMBB:926// RSP = RSP - RAX927// [rest of original MBB]928929// Set up the new basic blocks930MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);931MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);932MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);933934MachineFunction::iterator MBBIter = std::next(MBB.getIterator());935MF.insert(MBBIter, RoundMBB);936MF.insert(MBBIter, LoopMBB);937MF.insert(MBBIter, ContinueMBB);938939// Split MBB and move the tail portion down to ContinueMBB.940MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);941ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());942ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);943944// Some useful constants945const int64_t ThreadEnvironmentStackLimit = 0x10;946const int64_t PageSize = 0x1000;947const int64_t PageMask = ~(PageSize - 1);948949// Registers we need. For the normal case we use virtual950// registers. For the prolog expansion we use RAX, RCX and RDX.951MachineRegisterInfo &MRI = MF.getRegInfo();952const TargetRegisterClass *RegClass = &X86::GR64RegClass;953const Register954SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),955ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),956CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),957TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),958FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),959RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),960LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),961JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),962ProbeReg = InProlog ? 
X86::RCX : MRI.createVirtualRegister(RegClass);963964// SP-relative offsets where we can save RCX and RDX.965int64_t RCXShadowSlot = 0;966int64_t RDXShadowSlot = 0;967968// If inlining in the prolog, save RCX and RDX.969if (InProlog) {970// Compute the offsets. We need to account for things already971// pushed onto the stack at this point: return address, frame972// pointer (if used), and callee saves.973X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();974const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();975const bool HasFP = hasFP(MF);976977// Check if we need to spill RCX and/or RDX.978// Here we assume that no earlier prologue instruction changes RCX and/or979// RDX, so checking the block live-ins is enough.980const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);981const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);982int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);983// Assign the initial slot to both registers, then change RDX's slot if both984// need to be spilled.985if (IsRCXLiveIn)986RCXShadowSlot = InitSlot;987if (IsRDXLiveIn)988RDXShadowSlot = InitSlot;989if (IsRDXLiveIn && IsRCXLiveIn)990RDXShadowSlot += 8;991// Emit the saves if needed.992if (IsRCXLiveIn)993addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,994RCXShadowSlot)995.addReg(X86::RCX);996if (IsRDXLiveIn)997addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,998RDXShadowSlot)999.addReg(X86::RDX);1000} else {1001// Not in the prolog. Copy RAX to a virtual reg.1002BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);1003}10041005// Add code to MBB to check for overflow and set the new target stack pointer1006// to zero if so.1007BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)1008.addReg(ZeroReg, RegState::Undef)1009.addReg(ZeroReg, RegState::Undef);1010BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);1011BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)1012.addReg(CopyReg)1013.addReg(SizeReg);1014BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)1015.addReg(TestReg)1016.addReg(ZeroReg)1017.addImm(X86::COND_B);10181019// FinalReg now holds final stack pointer value, or zero if1020// allocation would overflow. Compare against the current stack1021// limit from the thread environment block. Note this limit is the1022// lowest touched page on the stack, not the point at which the OS1023// will cause an overflow exception, so this is just an optimization1024// to avoid unnecessarily touching pages that are below the current1025// SP but already committed to the stack by the OS.1026BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)1027.addReg(0)1028.addImm(1)1029.addReg(0)1030.addImm(ThreadEnvironmentStackLimit)1031.addReg(X86::GS);1032BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);1033// Jump if the desired stack pointer is at or above the stack limit.1034BuildMI(&MBB, DL, TII.get(X86::JCC_1))1035.addMBB(ContinueMBB)1036.addImm(X86::COND_AE);10371038// Add code to roundMBB to round the final stack pointer to a page boundary.1039RoundMBB->addLiveIn(FinalReg);1040BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)1041.addReg(FinalReg)1042.addImm(PageMask);1043BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);10441045// LimitReg now holds the current stack limit, RoundedReg page-rounded1046// final RSP value. 
Add code to loopMBB to decrement LimitReg page-by-page1047// and probe until we reach RoundedReg.1048if (!InProlog) {1049BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)1050.addReg(LimitReg)1051.addMBB(RoundMBB)1052.addReg(ProbeReg)1053.addMBB(LoopMBB);1054}10551056LoopMBB->addLiveIn(JoinReg);1057addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,1058false, -PageSize);10591060// Probe by storing a byte onto the stack.1061BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))1062.addReg(ProbeReg)1063.addImm(1)1064.addReg(0)1065.addImm(0)1066.addReg(0)1067.addImm(0);10681069LoopMBB->addLiveIn(RoundedReg);1070BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))1071.addReg(RoundedReg)1072.addReg(ProbeReg);1073BuildMI(LoopMBB, DL, TII.get(X86::JCC_1))1074.addMBB(LoopMBB)1075.addImm(X86::COND_NE);10761077MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();10781079// If in prolog, restore RDX and RCX.1080if (InProlog) {1081if (RCXShadowSlot) // It means we spilled RCX in the prologue.1082addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,1083TII.get(X86::MOV64rm), X86::RCX),1084X86::RSP, false, RCXShadowSlot);1085if (RDXShadowSlot) // It means we spilled RDX in the prologue.1086addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,1087TII.get(X86::MOV64rm), X86::RDX),1088X86::RSP, false, RDXShadowSlot);1089}10901091// Now that the probing is done, add code to continueMBB to update1092// the stack pointer for real.1093ContinueMBB->addLiveIn(SizeReg);1094BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)1095.addReg(X86::RSP)1096.addReg(SizeReg);10971098// Add the control flow edges we need.1099MBB.addSuccessor(ContinueMBB);1100MBB.addSuccessor(RoundMBB);1101RoundMBB->addSuccessor(LoopMBB);1102LoopMBB->addSuccessor(ContinueMBB);1103LoopMBB->addSuccessor(LoopMBB);11041105// Mark all the instructions added to the prolog as frame setup.1106if (InProlog) {1107for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {1108BeforeMBBI->setFlag(MachineInstr::FrameSetup);1109}1110for (MachineInstr &MI : *RoundMBB) {1111MI.setFlag(MachineInstr::FrameSetup);1112}1113for (MachineInstr &MI : *LoopMBB) {1114MI.setFlag(MachineInstr::FrameSetup);1115}1116for (MachineInstr &MI :1117llvm::make_range(ContinueMBB->begin(), ContinueMBBI)) {1118MI.setFlag(MachineInstr::FrameSetup);1119}1120}1121}11221123void X86FrameLowering::emitStackProbeCall(1124MachineFunction &MF, MachineBasicBlock &MBB,1125MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,1126std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {1127bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;11281129// FIXME: Add indirect thunk support and remove this.1130if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())1131report_fatal_error("Emitting stack probe calls on 64-bit with the large "1132"code model and indirect thunks not yet implemented.");11331134assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=1135MachineBasicBlock::LQR_Live &&1136"Stack probe calls will clobber live EFLAGS.");11371138unsigned CallOp;1139if (Is64Bit)1140CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;1141else1142CallOp = X86::CALLpcrel32;11431144StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);11451146MachineInstrBuilder CI;1147MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);11481149// All current stack probes take AX and SP as input, clobber flags, and1150// preserve all registers. 
x86_64 probes leave RSP unmodified.1151if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {1152// For the large code model, we have to call through a register. Use R11,1153// as it is scratch in all supported calling conventions.1154BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)1155.addExternalSymbol(MF.createExternalSymbolName(Symbol));1156CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);1157} else {1158CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))1159.addExternalSymbol(MF.createExternalSymbolName(Symbol));1160}11611162unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;1163unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;1164CI.addReg(AX, RegState::Implicit)1165.addReg(SP, RegState::Implicit)1166.addReg(AX, RegState::Define | RegState::Implicit)1167.addReg(SP, RegState::Define | RegState::Implicit)1168.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);11691170MachineInstr *ModInst = CI;1171if (STI.isTargetWin64() || !STI.isOSWindows()) {1172// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.1173// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp1174// themselves. They also does not clobber %rax so we can reuse it when1175// adjusting %rsp.1176// All other platforms do not specify a particular ABI for the stack probe1177// function, so we arbitrarily define it to not adjust %esp/%rsp itself.1178ModInst =1179BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)1180.addReg(SP)1181.addReg(AX);1182}11831184// DebugInfo variable locations -- if there's an instruction number for the1185// allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that1186// modifies SP.1187if (InstrNum) {1188if (STI.isTargetWin64() || !STI.isOSWindows()) {1189// Label destination operand of the subtract.1190MF.makeDebugValueSubstitution(*InstrNum,1191{ModInst->getDebugInstrNum(), 0});1192} else {1193// Label the call. The operand number is the penultimate operand, zero1194// based.1195unsigned SPDefOperand = ModInst->getNumOperands() - 2;1196MF.makeDebugValueSubstitution(1197*InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand});1198}1199}12001201if (InProlog) {1202// Apply the frame setup flag to all inserted instrs.1203for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)1204ExpansionMBBI->setFlag(MachineInstr::FrameSetup);1205}1206}12071208static unsigned calculateSetFPREG(uint64_t SPAdjust) {1209// Win64 ABI has a less restrictive limitation of 240; 128 works equally well1210// and might require smaller successive adjustments.1211const uint64_t Win64MaxSEHOffset = 128;1212uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);1213// Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.1214return SEHFrameOffset & -16;1215}12161217// If we're forcing a stack realignment we can't rely on just the frame1218// info, we need to know the ABI stack alignment as well in case we1219// have a call out. Otherwise just make sure we have some alignment - we'll1220// go with the minimum SlotSize.1221uint64_t1222X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {1223const MachineFrameInfo &MFI = MF.getFrameInfo();1224Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.1225Align StackAlign = getStackAlign();1226bool HasRealign = MF.getFunction().hasFnAttribute("stackrealign");1227if (HasRealign) {1228if (MFI.hasCalls())1229MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign;1230else if (MaxAlign < SlotSize)1231MaxAlign = Align(SlotSize);1232}12331234if (!Is64Bit && MF.getFunction().getCallingConv() == CallingConv::X86_INTR) {1235if (HasRealign)1236MaxAlign = (MaxAlign > 16) ? MaxAlign : Align(16);1237else1238MaxAlign = Align(16);1239}1240return MaxAlign.value();1241}12421243void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,1244MachineBasicBlock::iterator MBBI,1245const DebugLoc &DL, unsigned Reg,1246uint64_t MaxAlign) const {1247uint64_t Val = -MaxAlign;1248unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);12491250MachineFunction &MF = *MBB.getParent();1251const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();1252const X86TargetLowering &TLI = *STI.getTargetLowering();1253const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);1254const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);12551256// We want to make sure that (in worst case) less than StackProbeSize bytes1257// are not probed after the AND. This assumption is used in1258// emitStackProbeInlineGeneric.1259if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {1260{1261NumFrameLoopProbe++;1262MachineBasicBlock *entryMBB =1263MF.CreateMachineBasicBlock(MBB.getBasicBlock());1264MachineBasicBlock *headMBB =1265MF.CreateMachineBasicBlock(MBB.getBasicBlock());1266MachineBasicBlock *bodyMBB =1267MF.CreateMachineBasicBlock(MBB.getBasicBlock());1268MachineBasicBlock *footMBB =1269MF.CreateMachineBasicBlock(MBB.getBasicBlock());12701271MachineFunction::iterator MBBIter = MBB.getIterator();1272MF.insert(MBBIter, entryMBB);1273MF.insert(MBBIter, headMBB);1274MF.insert(MBBIter, bodyMBB);1275MF.insert(MBBIter, footMBB);1276const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;1277Register FinalStackProbed = Uses64BitFramePtr ? X86::R111278: Is64Bit ? X86::R11D1279: X86::EAX;12801281// Setup entry block1282{12831284entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);1285BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)1286.addReg(StackPtr)1287.setMIFlag(MachineInstr::FrameSetup);1288MachineInstr *MI =1289BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)1290.addReg(FinalStackProbed)1291.addImm(Val)1292.setMIFlag(MachineInstr::FrameSetup);12931294// The EFLAGS implicit def is dead.1295MI->getOperand(3).setIsDead();12961297BuildMI(entryMBB, DL,1298TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))1299.addReg(FinalStackProbed)1300.addReg(StackPtr)1301.setMIFlag(MachineInstr::FrameSetup);1302BuildMI(entryMBB, DL, TII.get(X86::JCC_1))1303.addMBB(&MBB)1304.addImm(X86::COND_E)1305.setMIFlag(MachineInstr::FrameSetup);1306entryMBB->addSuccessor(headMBB);1307entryMBB->addSuccessor(&MBB);1308}13091310// Loop entry block13111312{1313const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);1314BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)1315.addReg(StackPtr)1316.addImm(StackProbeSize)1317.setMIFlag(MachineInstr::FrameSetup);13181319BuildMI(headMBB, DL,1320TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr))1321.addReg(StackPtr)1322.addReg(FinalStackProbed)1323.setMIFlag(MachineInstr::FrameSetup);13241325// jump to the footer if StackPtr < FinalStackProbed1326BuildMI(headMBB, DL, TII.get(X86::JCC_1))1327.addMBB(footMBB)1328.addImm(X86::COND_B)1329.setMIFlag(MachineInstr::FrameSetup);13301331headMBB->addSuccessor(bodyMBB);1332headMBB->addSuccessor(footMBB);1333}13341335// setup loop body1336{1337addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))1338.setMIFlag(MachineInstr::FrameSetup),1339StackPtr, false, 0)1340.addImm(0)1341.setMIFlag(MachineInstr::FrameSetup);13421343const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);1344BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)1345.addReg(StackPtr)1346.addImm(StackProbeSize)1347.setMIFlag(MachineInstr::FrameSetup);13481349// cmp with stack pointer bound1350BuildMI(bodyMBB, DL,1351TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))1352.addReg(FinalStackProbed)1353.addReg(StackPtr)1354.setMIFlag(MachineInstr::FrameSetup);13551356// jump back while FinalStackProbed < StackPtr1357BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))1358.addMBB(bodyMBB)1359.addImm(X86::COND_B)1360.setMIFlag(MachineInstr::FrameSetup);1361bodyMBB->addSuccessor(bodyMBB);1362bodyMBB->addSuccessor(footMBB);1363}13641365// setup loop footer1366{1367BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)1368.addReg(FinalStackProbed)1369.setMIFlag(MachineInstr::FrameSetup);1370addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))1371.setMIFlag(MachineInstr::FrameSetup),1372StackPtr, false, 0)1373.addImm(0)1374.setMIFlag(MachineInstr::FrameSetup);1375footMBB->addSuccessor(&MBB);1376}13771378fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB});1379}1380} else {1381MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)1382.addReg(Reg)1383.addImm(Val)1384.setMIFlag(MachineInstr::FrameSetup);13851386// The EFLAGS implicit def is dead.1387MI->getOperand(3).setIsDead();1388}1389}13901391bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {1392// x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be1393// clobbered by any interrupt handler.1394assert(&STI == &MF.getSubtarget<X86Subtarget>() &&1395"MF used frame lowering for wrong subtarget");1396const Function &Fn = MF.getFunction();1397const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());1398return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);1399}14001401/// Return true if we need to use the restricted Windows x64 prologue and1402/// epilogue code patterns that can be described with WinCFI (.seh_*1403/// directives).1404bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {1405return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();1406}14071408bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {1409return !isWin64Prologue(MF) && MF.needsFrameMoves();1410}14111412/// Return true if an opcode is part of the REP group of instructions1413static bool isOpcodeRep(unsigned Opcode) {1414switch (Opcode) {1415case X86::REPNE_PREFIX:1416case X86::REP_MOVSB_32:1417case X86::REP_MOVSB_64:1418case X86::REP_MOVSD_32:1419case X86::REP_MOVSD_64:1420case X86::REP_MOVSQ_32:1421case X86::REP_MOVSQ_64:1422case X86::REP_MOVSW_32:1423case X86::REP_MOVSW_64:1424case X86::REP_PREFIX:1425case X86::REP_STOSB_32:1426case X86::REP_STOSB_64:1427case X86::REP_STOSD_32:1428case X86::REP_STOSD_64:1429case X86::REP_STOSQ_32:1430case X86::REP_STOSQ_64:1431case X86::REP_STOSW_32:1432case 
X86::REP_STOSW_64:1433return true;1434default:1435break;1436}1437return false;1438}14391440/// emitPrologue - Push callee-saved registers onto the stack, which1441/// automatically adjust the stack pointer. Adjust the stack pointer to allocate1442/// space for local variables. Also emit labels used by the exception handler to1443/// generate the exception handling frames.14441445/*1446Here's a gist of what gets emitted:14471448; Establish frame pointer, if needed1449[if needs FP]1450push %rbp1451.cfi_def_cfa_offset 161452.cfi_offset %rbp, -161453.seh_pushreg %rpb1454mov %rsp, %rbp1455.cfi_def_cfa_register %rbp14561457; Spill general-purpose registers1458[for all callee-saved GPRs]1459pushq %<reg>1460[if not needs FP]1461.cfi_def_cfa_offset (offset from RETADDR)1462.seh_pushreg %<reg>14631464; If the required stack alignment > default stack alignment1465; rsp needs to be re-aligned. This creates a "re-alignment gap"1466; of unknown size in the stack frame.1467[if stack needs re-alignment]1468and $MASK, %rsp14691470; Allocate space for locals1471[if target is Windows and allocated space > 4096 bytes]1472; Windows needs special care for allocations larger1473; than one page.1474mov $NNN, %rax1475call ___chkstk_ms/___chkstk1476sub %rax, %rsp1477[else]1478sub $NNN, %rsp14791480[if needs FP]1481.seh_stackalloc (size of XMM spill slots)1482.seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots1483[else]1484.seh_stackalloc NNN14851486; Spill XMMs1487; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,1488; they may get spilled on any platform, if the current function1489; calls @llvm.eh.unwind.init1490[if needs FP]1491[for all callee-saved XMM registers]1492movaps %<xmm reg>, -MMM(%rbp)1493[for all callee-saved XMM registers]1494.seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)1495; i.e. the offset relative to (%rbp - SEHFrameOffset)1496[else]1497[for all callee-saved XMM registers]1498movaps %<xmm reg>, KKK(%rsp)1499[for all callee-saved XMM registers]1500.seh_savexmm %<xmm reg>, KKK15011502.seh_endprologue15031504[if needs base pointer]1505mov %rsp, %rbx1506[if needs to restore base pointer]1507mov %rsp, -MMM(%rbp)15081509; Emit CFI info1510[if needs FP]1511[for all callee-saved registers]1512.cfi_offset %<reg>, (offset from %rbp)1513[else]1514.cfi_def_cfa_offset (offset from RETADDR)1515[for all callee-saved registers]1516.cfi_offset %<reg>, (offset from %rsp)15171518Notes:1519- .seh directives are emitted only for Windows 64 ABI1520- .cv_fpo directives are emitted on win32 when emitting CodeView1521- .cfi directives are emitted for all other ABIs1522- for 32-bit code, substitute %e?? 
registers for %r??1523*/15241525void X86FrameLowering::emitPrologue(MachineFunction &MF,1526MachineBasicBlock &MBB) const {1527assert(&STI == &MF.getSubtarget<X86Subtarget>() &&1528"MF used frame lowering for wrong subtarget");1529MachineBasicBlock::iterator MBBI = MBB.begin();1530MachineFrameInfo &MFI = MF.getFrameInfo();1531const Function &Fn = MF.getFunction();1532X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();1533uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.1534uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.1535bool IsFunclet = MBB.isEHFuncletEntry();1536EHPersonality Personality = EHPersonality::Unknown;1537if (Fn.hasPersonalityFn())1538Personality = classifyEHPersonality(Fn.getPersonalityFn());1539bool FnHasClrFunclet =1540MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;1541bool IsClrFunclet = IsFunclet && FnHasClrFunclet;1542bool HasFP = hasFP(MF);1543bool IsWin64Prologue = isWin64Prologue(MF);1544bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();1545// FIXME: Emit FPO data for EH funclets.1546bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() &&1547MF.getFunction().getParent()->getCodeViewFlag();1548bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;1549bool NeedsDwarfCFI = needsDwarfCFI(MF);1550Register FramePtr = TRI->getFrameRegister(MF);1551const Register MachineFramePtr =1552STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))1553: FramePtr;1554Register BasePtr = TRI->getBaseRegister();1555bool HasWinCFI = false;15561557// Debug location must be unknown since the first debug location is used1558// to determine the end of the prologue.1559DebugLoc DL;1560Register ArgBaseReg;15611562// Emit extra prolog for argument stack slot reference.1563if (auto *MI = X86FI->getStackPtrSaveMI()) {1564// MI is lea instruction that created in X86ArgumentStackSlotPass.1565// Creat extra prolog for stack realignment.1566ArgBaseReg = MI->getOperand(0).getReg();1567// leal 4(%esp), %basereg1568// .cfi_def_cfa %basereg, 01569// andl $-128, %esp1570// pushl -4(%basereg)1571BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::LEA64r : X86::LEA32r),1572ArgBaseReg)1573.addUse(StackPtr)1574.addImm(1)1575.addUse(X86::NoRegister)1576.addImm(SlotSize)1577.addUse(X86::NoRegister)1578.setMIFlag(MachineInstr::FrameSetup);1579if (NeedsDwarfCFI) {1580// .cfi_def_cfa %basereg, 01581unsigned DwarfStackPtr = TRI->getDwarfRegNum(ArgBaseReg, true);1582BuildCFI(MBB, MBBI, DL,1583MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, 0),1584MachineInstr::FrameSetup);1585}1586BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);1587int64_t Offset = -(int64_t)SlotSize;1588BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm))
        .addReg(ArgBaseReg)
        .addImm(1)
        .addReg(X86::NoRegister)
        .addImm(Offset)
        .addReg(X86::NoRegister)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Space reserved for stack-based arguments when making a (ABI-guaranteed)
  // tail call.
  unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
  if (TailCallArgReserveSize && IsWin64Prologue)
    report_fatal_error("Can't handle guaranteed tail call under win64 yet");

  const bool EmitStackProbeCall =
      STI.getTargetLowering()->hasStackProbeSymbol(MF);
  unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);

  if (HasFP && X86FI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      if (STI.swiftAsyncContextIsDynamicallySet()) {
        // The special symbol below is absolute and has a *value* suitable to
        // be combined with the frame pointer directly.
        BuildMI(MBB, MBBI, DL, TII.get(X86::OR64rm), MachineFramePtr)
            .addUse(MachineFramePtr)
            .addUse(X86::RIP)
            .addImm(1)
            .addUse(X86::NoRegister)
            .addExternalSymbol("swift_async_extendedFramePointerFlags",
                               X86II::MO_GOTPCREL)
            .addUse(X86::NoRegister);
        break;
      }
      [[fallthrough]];

    case SwiftAsyncFramePointerMode::Always:
      assert(
          !IsWin64Prologue &&
          "win64 prologue does not set the bit 60 in the saved frame pointer");
      BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8), MachineFramePtr)
          .addUse(MachineFramePtr)
          .addImm(60)
          .setMIFlag(MachineInstr::FrameSetup);
      break;

    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }

  // Re-align the stack on 64-bit if the x86-interrupt calling convention is
  // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
  // stack alignment.
  if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
      Fn.arg_size() == 2) {
    StackSize += 8;
    MFI.setStackSize(StackSize);

    // Update the stack pointer by pushing a register. This is the instruction
    // that would end up being emitted by a call to `emitSPUpdate`.
    // Hard-coding the update to a push avoids emitting a second
    // `STACKALLOC_W_PROBING` instruction in the save block: We know that stack
    // probing isn't needed anyway for an 8-byte update.
    // Pushing a register leaves us in a similar situation to a regular
    // function call where we know that the address at (rsp-8) is writeable.
    // That way we avoid any off-by-ones with stack probing for additional
    // stack pointer updates later on.
    BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
        .addReg(X86::RAX, RegState::Undef)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If this is x86-64, the Red Zone is not disabled, we are a leaf function,
  // we use up to 128 bytes of stack space, and we don't have a frame pointer,
  // calls, or dynamic allocas, then we do not need to adjust the stack
  // pointer (we fit in the Red Zone).
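  //
  // Illustrative sketch (not code emitted by this function): a leaf function
  // with, say, 24 bytes of locals and no frame pointer can leave RSP untouched
  // and address its locals at negative offsets, e.g.
  //     movl %edi, -4(%rsp)
  //     movq %rsi, -16(%rsp)
  // because the x86-64 System V ABI guarantees that the 128 bytes below RSP
  // (the red zone) are not clobbered by signal or interrupt handlers.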
  // We also check that we don't push and pop from the stack.
  if (has128ByteRedZone(MF) && !TRI->hasStackRealignment(MF) &&
      !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
      !MFI.adjustsStack() &&                   // No calls.
      !EmitStackProbeCall &&                   // No stack probes.
      !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
      !MF.shouldSplitStack()) {                // Regular stack
    uint64_t MinSize =
        X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
    if (HasFP)
      MinSize += SlotSize;
    X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
    MFI.setStackSize(StackSize);
  }

  // Insert stack pointer adjustment for later moving of return addr. Only
  // applies to tail call optimized functions where the callee argument stack
  // size is bigger than the caller's.
  if (TailCallArgReserveSize != 0) {
    BuildStackAdjustment(MBB, MBBI, DL, -(int)TailCallArgReserveSize,
                         /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Mapping for machine moves:
  //
  //   DST: VirtualFP AND
  //        SRC: VirtualFP          => DW_CFA_def_cfa_offset
  //        ELSE                    => DW_CFA_def_cfa
  //
  //   SRC: VirtualFP AND
  //        DST: Register           => DW_CFA_def_cfa_register
  //
  //   ELSE
  //        OFFSET < 0              => DW_CFA_offset_extended_sf
  //        REG < 64                => DW_CFA_offset + Reg
  //        ELSE                    => DW_CFA_offset_extended

  uint64_t NumBytes = 0;
  int stackGrowth = -SlotSize;

  // Find the funclet establisher parameter
  Register Establisher = X86::NoRegister;
  if (IsClrFunclet)
    Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
  else if (IsFunclet)
    Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;

  if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
    // Immediately spill the establisher into the home slot.
    // The runtime cares about this.
    // MOV64mr %rdx, 16(%rsp)
    unsigned MOVmr = Uses64BitFramePtr ?
X86::MOV64mr : X86::MOV32mr;1720addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)1721.addReg(Establisher)1722.setMIFlag(MachineInstr::FrameSetup);1723MBB.addLiveIn(Establisher);1724}17251726if (HasFP) {1727assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");17281729// Calculate required stack adjustment.1730uint64_t FrameSize = StackSize - SlotSize;1731NumBytes =1732FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);17331734// Callee-saved registers are pushed on stack before the stack is realigned.1735if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)1736NumBytes = alignTo(NumBytes, MaxAlign);17371738// Save EBP/RBP into the appropriate stack slot.1739BuildMI(MBB, MBBI, DL,1740TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))1741.addReg(MachineFramePtr, RegState::Kill)1742.setMIFlag(MachineInstr::FrameSetup);17431744if (NeedsDwarfCFI && !ArgBaseReg.isValid()) {1745// Mark the place where EBP/RBP was saved.1746// Define the current CFA rule to use the provided offset.1747assert(StackSize);1748BuildCFI(MBB, MBBI, DL,1749MCCFIInstruction::cfiDefCfaOffset(1750nullptr, -2 * stackGrowth + (int)TailCallArgReserveSize),1751MachineInstr::FrameSetup);17521753// Change the rule for the FramePtr to be an "offset" rule.1754unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);1755BuildCFI(MBB, MBBI, DL,1756MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,17572 * stackGrowth -1758(int)TailCallArgReserveSize),1759MachineInstr::FrameSetup);1760}17611762if (NeedsWinCFI) {1763HasWinCFI = true;1764BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1765.addImm(FramePtr)1766.setMIFlag(MachineInstr::FrameSetup);1767}17681769if (!IsFunclet) {1770if (X86FI->hasSwiftAsyncContext()) {1771assert(!IsWin64Prologue &&1772"win64 prologue does not store async context right below rbp");1773const auto &Attrs = MF.getFunction().getAttributes();17741775// Before we update the live frame pointer we have to ensure there's a1776// valid (or null) asynchronous context in its slot just before FP in1777// the frame record, so store it now.1778if (Attrs.hasAttrSomewhere(Attribute::SwiftAsync)) {1779// We have an initial context in r14, store it just before the frame1780// pointer.1781MBB.addLiveIn(X86::R14);1782BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))1783.addReg(X86::R14)1784.setMIFlag(MachineInstr::FrameSetup);1785} else {1786// No initial context, store null so that there's no pointer that1787// could be misused.1788BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64i32))1789.addImm(0)1790.setMIFlag(MachineInstr::FrameSetup);1791}17921793if (NeedsWinCFI) {1794HasWinCFI = true;1795BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1796.addImm(X86::R14)1797.setMIFlag(MachineInstr::FrameSetup);1798}17991800BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr)1801.addUse(X86::RSP)1802.addImm(1)1803.addUse(X86::NoRegister)1804.addImm(8)1805.addUse(X86::NoRegister)1806.setMIFlag(MachineInstr::FrameSetup);1807BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64ri32), X86::RSP)1808.addUse(X86::RSP)1809.addImm(8)1810.setMIFlag(MachineInstr::FrameSetup);1811}18121813if (!IsWin64Prologue && !IsFunclet) {1814// Update EBP with the new base value.1815if (!X86FI->hasSwiftAsyncContext())1816BuildMI(MBB, MBBI, DL,1817TII.get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr),1818FramePtr)1819.addReg(StackPtr)1820.setMIFlag(MachineInstr::FrameSetup);18211822if (NeedsDwarfCFI) {1823if (ArgBaseReg.isValid()) {1824SmallString<64> CfaExpr;1825CfaExpr.push_back(dwarf::DW_CFA_expression);1826uint8_t buffer[16];1827unsigned DwarfReg = TRI->getDwarfRegNum(MachineFramePtr, true);1828CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));1829CfaExpr.push_back(2);1830CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));1831CfaExpr.push_back(0);1832// DW_CFA_expression: reg5 DW_OP_breg5 +01833BuildCFI(MBB, MBBI, DL,1834MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),1835MachineInstr::FrameSetup);1836} else {1837// Mark effective beginning of when frame pointer becomes valid.1838// Define the current CFA to use the EBP/RBP register.1839unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);1840BuildCFI(1841MBB, MBBI, DL,1842MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr),1843MachineInstr::FrameSetup);1844}1845}18461847if (NeedsWinFPO) {1848// .cv_fpo_setframe $FramePtr1849HasWinCFI = true;1850BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))1851.addImm(FramePtr)1852.addImm(0)1853.setMIFlag(MachineInstr::FrameSetup);1854}1855}1856}1857} else {1858assert(!IsFunclet && "funclets without FPs not yet implemented");1859NumBytes =1860StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);1861}18621863// Update the offset adjustment, which is mainly used by codeview to translate1864// from ESP to VFRAME relative local variable offsets.1865if (!IsFunclet) {1866if (HasFP && TRI->hasStackRealignment(MF))1867MFI.setOffsetAdjustment(-NumBytes);1868else1869MFI.setOffsetAdjustment(-StackSize);1870}18711872// For EH funclets, only allocate enough space for outgoing calls. 
Save the1873// NumBytes value that we would've used for the parent frame.1874unsigned ParentFrameNumBytes = NumBytes;1875if (IsFunclet)1876NumBytes = getWinEHFuncletFrameSize(MF);18771878// Skip the callee-saved push instructions.1879bool PushedRegs = false;1880int StackOffset = 2 * stackGrowth;1881MachineBasicBlock::const_iterator LastCSPush = MBBI;1882auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {1883if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))1884return false;1885unsigned Opc = MBBI->getOpcode();1886return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||1887Opc == X86::PUSH2 || Opc == X86::PUSH2P;1888};18891890while (IsCSPush(MBBI)) {1891PushedRegs = true;1892Register Reg = MBBI->getOperand(0).getReg();1893LastCSPush = MBBI;1894++MBBI;1895unsigned Opc = LastCSPush->getOpcode();18961897if (!HasFP && NeedsDwarfCFI) {1898// Mark callee-saved push instruction.1899// Define the current CFA rule to use the provided offset.1900assert(StackSize);1901// Compared to push, push2 introduces more stack offset (one more1902// register).1903if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)1904StackOffset += stackGrowth;1905BuildCFI(MBB, MBBI, DL,1906MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),1907MachineInstr::FrameSetup);1908StackOffset += stackGrowth;1909}19101911if (NeedsWinCFI) {1912HasWinCFI = true;1913BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1914.addImm(Reg)1915.setMIFlag(MachineInstr::FrameSetup);1916if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)1917BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1918.addImm(LastCSPush->getOperand(1).getReg())1919.setMIFlag(MachineInstr::FrameSetup);1920}1921}19221923// Realign stack after we pushed callee-saved registers (so that we'll be1924// able to calculate their offsets from the frame pointer).1925// Don't do this for Win64, it needs to realign the stack after the prologue.1926if (!IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF) &&1927!ArgBaseReg.isValid()) {1928assert(HasFP && "There should be a frame pointer if stack is realigned.");1929BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);19301931if (NeedsWinCFI) {1932HasWinCFI = true;1933BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))1934.addImm(MaxAlign)1935.setMIFlag(MachineInstr::FrameSetup);1936}1937}19381939// If there is an SUB32ri of ESP immediately before this instruction, merge1940// the two. This can be the case when tail call elimination is enabled and1941// the callee has more arguments then the caller.1942NumBytes -= mergeSPUpdates(MBB, MBBI, true);19431944// Adjust stack pointer: ESP -= numbytes.19451946// Windows and cygwin/mingw require a prologue helper routine when allocating1947// more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw1948// uses __alloca. __alloca and the 32-bit version of __chkstk will probe the1949// stack and adjust the stack pointer in one go. The 64-bit version of1950// __chkstk is only responsible for probing the stack. The 64-bit prologue is1951// responsible for adjusting the stack pointer. 
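  // Illustrative example (sizes assumed): for a 72 KiB allocation on Win64 the
  // emitted sequence is roughly
  //     movl  $73728, %eax
  //     callq __chkstk        ; probes the pages but does not move RSP
  //     subq  %rax, %rsp      ; the prologue performs the actual adjustment
  // whereas the 32-bit __chkstk/__alloca helpers both probe and adjust ESP.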
Touching the stack at 4K1952// increments is necessary to ensure that the guard pages used by the OS1953// virtual memory manager are allocated in correct sequence.1954uint64_t AlignedNumBytes = NumBytes;1955if (IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF))1956AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);1957if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {1958assert(!X86FI->getUsesRedZone() &&1959"The Red Zone is not accounted for in stack probes");19601961// Check whether EAX is livein for this block.1962bool isEAXAlive = isEAXLiveIn(MBB);19631964if (isEAXAlive) {1965if (Is64Bit) {1966// Save RAX1967BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))1968.addReg(X86::RAX, RegState::Kill)1969.setMIFlag(MachineInstr::FrameSetup);1970} else {1971// Save EAX1972BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))1973.addReg(X86::EAX, RegState::Kill)1974.setMIFlag(MachineInstr::FrameSetup);1975}1976}19771978if (Is64Bit) {1979// Handle the 64-bit Windows ABI case where we need to call __chkstk.1980// Function prologue is responsible for adjusting the stack pointer.1981int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;1982BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX)1983.addImm(Alloc)1984.setMIFlag(MachineInstr::FrameSetup);1985} else {1986// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.1987// We'll also use 4 already allocated bytes for EAX.1988BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)1989.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)1990.setMIFlag(MachineInstr::FrameSetup);1991}19921993// Call __chkstk, __chkstk_ms, or __alloca.1994emitStackProbe(MF, MBB, MBBI, DL, true);19951996if (isEAXAlive) {1997// Restore RAX/EAX1998MachineInstr *MI;1999if (Is64Bit)2000MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),2001StackPtr, false, NumBytes - 8);2002else2003MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),2004StackPtr, false, NumBytes - 4);2005MI->setFlag(MachineInstr::FrameSetup);2006MBB.insert(MBBI, MI);2007}2008} else if (NumBytes) {2009emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);2010}20112012if (NeedsWinCFI && NumBytes) {2013HasWinCFI = true;2014BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))2015.addImm(NumBytes)2016.setMIFlag(MachineInstr::FrameSetup);2017}20182019int SEHFrameOffset = 0;2020unsigned SPOrEstablisher;2021if (IsFunclet) {2022if (IsClrFunclet) {2023// The establisher parameter passed to a CLR funclet is actually a pointer2024// to the (mostly empty) frame of its nearest enclosing funclet; we have2025// to find the root function establisher frame by loading the PSPSym from2026// the intermediate frame.2027unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);2028MachinePointerInfo NoInfo;2029MBB.addLiveIn(Establisher);2030addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),2031Establisher, false, PSPSlotOffset)2032.addMemOperand(MF.getMachineMemOperand(2033NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));2034;2035// Save the root establisher back into the current funclet's (mostly2036// empty) frame, in case a sub-funclet or the GC needs it.2037addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,2038false, PSPSlotOffset)2039.addReg(Establisher)2040.addMemOperand(MF.getMachineMemOperand(2041NoInfo,2042MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,2043SlotSize, Align(SlotSize)));2044}2045SPOrEstablisher = Establisher;2046} else {2047SPOrEstablisher = StackPtr;2048}20492050if 
(IsWin64Prologue && HasFP) {2051// Set RBP to a small fixed offset from RSP. In the funclet case, we base2052// this calculation on the incoming establisher, which holds the value of2053// RSP from the parent frame at the end of the prologue.2054SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);2055if (SEHFrameOffset)2056addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),2057SPOrEstablisher, false, SEHFrameOffset);2058else2059BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)2060.addReg(SPOrEstablisher);20612062// If this is not a funclet, emit the CFI describing our frame pointer.2063if (NeedsWinCFI && !IsFunclet) {2064assert(!NeedsWinFPO && "this setframe incompatible with FPO data");2065HasWinCFI = true;2066BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))2067.addImm(FramePtr)2068.addImm(SEHFrameOffset)2069.setMIFlag(MachineInstr::FrameSetup);2070if (isAsynchronousEHPersonality(Personality))2071MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;2072}2073} else if (IsFunclet && STI.is32Bit()) {2074// Reset EBP / ESI to something good for funclets.2075MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);2076// If we're a catch funclet, we can be returned to via catchret. Save ESP2077// into the registration node so that the runtime will restore it for us.2078if (!MBB.isCleanupFuncletEntry()) {2079assert(Personality == EHPersonality::MSVC_CXX);2080Register FrameReg;2081int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;2082int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();2083// ESP is the first field, so no extra displacement is needed.2084addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,2085false, EHRegOffset)2086.addReg(X86::ESP);2087}2088}20892090while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {2091const MachineInstr &FrameInstr = *MBBI;2092++MBBI;20932094if (NeedsWinCFI) {2095int FI;2096if (Register Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {2097if (X86::FR64RegClass.contains(Reg)) {2098int Offset;2099Register IgnoredFrameReg;2100if (IsWin64Prologue && IsFunclet)2101Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);2102else2103Offset =2104getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +2105SEHFrameOffset;21062107HasWinCFI = true;2108assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");2109BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))2110.addImm(Reg)2111.addImm(Offset)2112.setMIFlag(MachineInstr::FrameSetup);2113}2114}2115}2116}21172118if (NeedsWinCFI && HasWinCFI)2119BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))2120.setMIFlag(MachineInstr::FrameSetup);21212122if (FnHasClrFunclet && !IsFunclet) {2123// Save the so-called Initial-SP (i.e. 
the value of the stack pointer2124// immediately after the prolog) into the PSPSlot so that funclets2125// and the GC can recover it.2126unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);2127auto PSPInfo = MachinePointerInfo::getFixedStack(2128MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);2129addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,2130PSPSlotOffset)2131.addReg(StackPtr)2132.addMemOperand(MF.getMachineMemOperand(2133PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,2134SlotSize, Align(SlotSize)));2135}21362137// Realign stack after we spilled callee-saved registers (so that we'll be2138// able to calculate their offsets from the frame pointer).2139// Win64 requires aligning the stack after the prologue.2140if (IsWin64Prologue && TRI->hasStackRealignment(MF)) {2141assert(HasFP && "There should be a frame pointer if stack is realigned.");2142BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);2143}21442145// We already dealt with stack realignment and funclets above.2146if (IsFunclet && STI.is32Bit())2147return;21482149// If we need a base pointer, set it up here. It's whatever the value2150// of the stack pointer is at this point. Any variable size objects2151// will be allocated after this, so we can still use the base pointer2152// to reference locals.2153if (TRI->hasBasePointer(MF)) {2154// Update the base pointer with the current stack pointer.2155unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;2156BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)2157.addReg(SPOrEstablisher)2158.setMIFlag(MachineInstr::FrameSetup);2159if (X86FI->getRestoreBasePointer()) {2160// Stash value of base pointer. Saving RSP instead of EBP shortens2161// dependence chain. Used by SjLj EH.2162unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;2163addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true,2164X86FI->getRestoreBasePointerOffset())2165.addReg(SPOrEstablisher)2166.setMIFlag(MachineInstr::FrameSetup);2167}21682169if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {2170// Stash the value of the frame pointer relative to the base pointer for2171// Win32 EH. This supports Win32 EH, which does the inverse of the above:2172// it recovers the frame pointer from the base pointer rather than the2173// other way around.2174unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;2175Register UsedReg;2176int Offset =2177getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)2178.getFixed();2179assert(UsedReg == BasePtr);2180addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)2181.addReg(FramePtr)2182.setMIFlag(MachineInstr::FrameSetup);2183}2184}2185if (ArgBaseReg.isValid()) {2186// Save argument base pointer.2187auto *MI = X86FI->getStackPtrSaveMI();2188int FI = MI->getOperand(1).getIndex();2189unsigned MOVmr = Is64Bit ? 
X86::MOV64mr : X86::MOV32mr;2190// movl %basereg, offset(%ebp)2191addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), FI)2192.addReg(ArgBaseReg)2193.setMIFlag(MachineInstr::FrameSetup);2194}21952196if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {2197// Mark end of stack pointer adjustment.2198if (!HasFP && NumBytes) {2199// Define the current CFA rule to use the provided offset.2200assert(StackSize);2201BuildCFI(2202MBB, MBBI, DL,2203MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth),2204MachineInstr::FrameSetup);2205}22062207// Emit DWARF info specifying the offsets of the callee-saved registers.2208emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);2209}22102211// X86 Interrupt handling function cannot assume anything about the direction2212// flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction2213// in each prologue of interrupt handler function.2214//2215// Create "cld" instruction only in these cases:2216// 1. The interrupt handling function uses any of the "rep" instructions.2217// 2. Interrupt handling function calls another function.2218// 3. If there are any inline asm blocks, as we do not know what they do2219//2220// TODO: We should also emit cld if we detect the use of std, but as of now,2221// the compiler does not even emit that instruction or even define it, so in2222// practice, this would only happen with inline asm, which we cover anyway.2223if (Fn.getCallingConv() == CallingConv::X86_INTR) {2224bool NeedsCLD = false;22252226for (const MachineBasicBlock &B : MF) {2227for (const MachineInstr &MI : B) {2228if (MI.isCall()) {2229NeedsCLD = true;2230break;2231}22322233if (isOpcodeRep(MI.getOpcode())) {2234NeedsCLD = true;2235break;2236}22372238if (MI.isInlineAsm()) {2239// TODO: Parse asm for rep instructions or call sites?2240// For now, let's play it safe and emit a cld instruction2241// just in case.2242NeedsCLD = true;2243break;2244}2245}2246}22472248if (NeedsCLD) {2249BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))2250.setMIFlag(MachineInstr::FrameSetup);2251}2252}22532254// At this point we know if the function has WinCFI or not.2255MF.setHasWinCFI(HasWinCFI);2256}22572258bool X86FrameLowering::canUseLEAForSPInEpilogue(2259const MachineFunction &MF) const {2260// We can't use LEA instructions for adjusting the stack pointer if we don't2261// have a frame pointer in the Win64 ABI. Only ADD instructions may be used2262// to deallocate the stack.2263// This means that we can use LEA for SP in two situations:2264// 1. We *aren't* using the Win64 ABI which means we are free to use LEA.2265// 2. We *have* a frame pointer which means we are permitted to use LEA.2266return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);2267}22682269static bool isFuncletReturnInstr(MachineInstr &MI) {2270switch (MI.getOpcode()) {2271case X86::CATCHRET:2272case X86::CLEANUPRET:2273return true;2274default:2275return false;2276}2277llvm_unreachable("impossible");2278}22792280// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the2281// stack. It holds a pointer to the bottom of the root function frame. The2282// establisher frame pointer passed to a nested funclet may point to the2283// (mostly empty) frame of its parent funclet, but it will need to find2284// the frame of the root function to access locals. To facilitate this,2285// every funclet copies the pointer to the bottom of the root function2286// frame into a PSPSym slot in its own (mostly empty) stack frame. 
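// Illustrative layout (offsets assumed): if the root function stores the
// PSPSym at [RSP + K] right after its prologue, every funclet sizes its own
// frame so that its [RSP + K] is the same slot:
//     root frame:    [RSP + K] = bottom of the root frame
//     each funclet:  [RSP + K] = copy of that same pointer
// so a single offset K, reported once for the whole method, works no matter
// which frame the runtime or the GC is looking at.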
Using the2287// same offset for the PSPSym in the root function frame that's used in the2288// funclets' frames allows each funclet to dynamically accept any ancestor2289// frame as its establisher argument (the runtime doesn't guarantee the2290// immediate parent for some reason lost to history), and also allows the GC,2291// which uses the PSPSym for some bookkeeping, to find it in any funclet's2292// frame with only a single offset reported for the entire method.2293unsigned2294X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {2295const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();2296Register SPReg;2297int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,2298/*IgnoreSPUpdates*/ true)2299.getFixed();2300assert(Offset >= 0 && SPReg == TRI->getStackRegister());2301return static_cast<unsigned>(Offset);2302}23032304unsigned2305X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {2306const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2307// This is the size of the pushed CSRs.2308unsigned CSSize = X86FI->getCalleeSavedFrameSize();2309// This is the size of callee saved XMMs.2310const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();2311unsigned XMMSize =2312WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass);2313// This is the amount of stack a funclet needs to allocate.2314unsigned UsedSize;2315EHPersonality Personality =2316classifyEHPersonality(MF.getFunction().getPersonalityFn());2317if (Personality == EHPersonality::CoreCLR) {2318// CLR funclets need to hold enough space to include the PSPSym, at the2319// same offset from the stack pointer (immediately after the prolog) as it2320// resides at in the main function.2321UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;2322} else {2323// Other funclets just need enough stack for outgoing call arguments.2324UsedSize = MF.getFrameInfo().getMaxCallFrameSize();2325}2326// RBP is not included in the callee saved register block. After pushing RBP,2327// everything is 16 byte aligned. Everything we allocate before an outgoing2328// call must also be 16 byte aligned.2329unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());2330// Subtract out the size of the callee saved registers. This is how much stack2331// each funclet will allocate.2332return FrameSizeMinusRBP + XMMSize - CSSize;2333}23342335static bool isTailCallOpcode(unsigned Opc) {2336return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||2337Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||2338Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;2339}23402341void X86FrameLowering::emitEpilogue(MachineFunction &MF,2342MachineBasicBlock &MBB) const {2343const MachineFrameInfo &MFI = MF.getFrameInfo();2344X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2345MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();2346MachineBasicBlock::iterator MBBI = Terminator;2347DebugLoc DL;2348if (MBBI != MBB.end())2349DL = MBBI->getDebugLoc();2350// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.2351const bool Is64BitILP32 = STI.isTarget64BitILP32();2352Register FramePtr = TRI->getFrameRegister(MF);2353Register MachineFramePtr =2354Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;23552356bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();2357bool NeedsWin64CFI =2358IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();2359bool IsFunclet = MBBI == MBB.end() ? 
false : isFuncletReturnInstr(*MBBI);23602361// Get the number of bytes to allocate from the FrameInfo.2362uint64_t StackSize = MFI.getStackSize();2363uint64_t MaxAlign = calculateMaxStackAlign(MF);2364unsigned CSSize = X86FI->getCalleeSavedFrameSize();2365unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();2366bool HasFP = hasFP(MF);2367uint64_t NumBytes = 0;23682369bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&2370!MF.getTarget().getTargetTriple().isOSWindows()) &&2371MF.needsFrameMoves();23722373Register ArgBaseReg;2374if (auto *MI = X86FI->getStackPtrSaveMI()) {2375unsigned Opc = X86::LEA32r;2376Register StackReg = X86::ESP;2377ArgBaseReg = MI->getOperand(0).getReg();2378if (STI.is64Bit()) {2379Opc = X86::LEA64r;2380StackReg = X86::RSP;2381}2382// leal -4(%basereg), %esp2383// .cfi_def_cfa %esp, 42384BuildMI(MBB, MBBI, DL, TII.get(Opc), StackReg)2385.addUse(ArgBaseReg)2386.addImm(1)2387.addUse(X86::NoRegister)2388.addImm(-(int64_t)SlotSize)2389.addUse(X86::NoRegister)2390.setMIFlag(MachineInstr::FrameDestroy);2391if (NeedsDwarfCFI) {2392unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackReg, true);2393BuildCFI(MBB, MBBI, DL,2394MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),2395MachineInstr::FrameDestroy);2396--MBBI;2397}2398--MBBI;2399}24002401if (IsFunclet) {2402assert(HasFP && "EH funclets without FP not yet implemented");2403NumBytes = getWinEHFuncletFrameSize(MF);2404} else if (HasFP) {2405// Calculate required stack adjustment.2406uint64_t FrameSize = StackSize - SlotSize;2407NumBytes = FrameSize - CSSize - TailCallArgReserveSize;24082409// Callee-saved registers were pushed on stack before the stack was2410// realigned.2411if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)2412NumBytes = alignTo(FrameSize, MaxAlign);2413} else {2414NumBytes = StackSize - CSSize - TailCallArgReserveSize;2415}2416uint64_t SEHStackAllocAmt = NumBytes;24172418// AfterPop is the position to insert .cfi_restore.2419MachineBasicBlock::iterator AfterPop = MBBI;2420if (HasFP) {2421if (X86FI->hasSwiftAsyncContext()) {2422// Discard the context.2423int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);2424emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);2425}2426// Pop EBP.2427BuildMI(MBB, MBBI, DL,2428TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),2429MachineFramePtr)2430.setMIFlag(MachineInstr::FrameDestroy);24312432// We need to reset FP to its untagged state on return. Bit 60 is currently2433// used to show the presence of an extended frame.2434if (X86FI->hasSwiftAsyncContext()) {2435BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr)2436.addUse(MachineFramePtr)2437.addImm(60)2438.setMIFlag(MachineInstr::FrameDestroy);2439}24402441if (NeedsDwarfCFI) {2442if (!ArgBaseReg.isValid()) {2443unsigned DwarfStackPtr =2444TRI->getDwarfRegNum(Is64Bit ? 
X86::RSP : X86::ESP, true);2445BuildCFI(MBB, MBBI, DL,2446MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),2447MachineInstr::FrameDestroy);2448}2449if (!MBB.succ_empty() && !MBB.isReturnBlock()) {2450unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);2451BuildCFI(MBB, AfterPop, DL,2452MCCFIInstruction::createRestore(nullptr, DwarfFramePtr),2453MachineInstr::FrameDestroy);2454--MBBI;2455--AfterPop;2456}2457--MBBI;2458}2459}24602461MachineBasicBlock::iterator FirstCSPop = MBBI;2462// Skip the callee-saved pop instructions.2463while (MBBI != MBB.begin()) {2464MachineBasicBlock::iterator PI = std::prev(MBBI);2465unsigned Opc = PI->getOpcode();24662467if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {2468if (!PI->getFlag(MachineInstr::FrameDestroy) ||2469(Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&2470Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&2471Opc != X86::POP2P && Opc != X86::LEA64r))2472break;2473FirstCSPop = PI;2474}24752476--MBBI;2477}2478if (ArgBaseReg.isValid()) {2479// Restore argument base pointer.2480auto *MI = X86FI->getStackPtrSaveMI();2481int FI = MI->getOperand(1).getIndex();2482unsigned MOVrm = Is64Bit ? X86::MOV64rm : X86::MOV32rm;2483// movl offset(%ebp), %basereg2484addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVrm), ArgBaseReg), FI)2485.setMIFlag(MachineInstr::FrameDestroy);2486}2487MBBI = FirstCSPop;24882489if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)2490emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);24912492if (MBBI != MBB.end())2493DL = MBBI->getDebugLoc();2494// If there is an ADD32ri or SUB32ri of ESP immediately before this2495// instruction, merge the two instructions.2496if (NumBytes || MFI.hasVarSizedObjects())2497NumBytes += mergeSPUpdates(MBB, MBBI, true);24982499// If dynamic alloca is used, then reset esp to point to the last callee-saved2500// slot before popping them off! Same applies for the case, when stack was2501// realigned. Don't do this if this was a funclet epilogue, since the funclets2502// will not do realignment or dynamic stack allocation.2503if (((TRI->hasStackRealignment(MF)) || MFI.hasVarSizedObjects()) &&2504!IsFunclet) {2505if (TRI->hasStackRealignment(MF))2506MBBI = FirstCSPop;2507unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);2508uint64_t LEAAmount =2509IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;25102511if (X86FI->hasSwiftAsyncContext())2512LEAAmount -= 16;25132514// There are only two legal forms of epilogue:2515// - add SEHAllocationSize, %rsp2516// - lea SEHAllocationSize(%FramePtr), %rsp2517//2518// 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.2519// However, we may use this sequence if we have a frame pointer because the2520// effects of the prologue can safely be undone.2521if (LEAAmount != 0) {2522unsigned Opc = getLEArOpcode(Uses64BitFramePtr);2523addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr,2524false, LEAAmount);2525--MBBI;2526} else {2527unsigned Opc = (Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr);2528BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);2529--MBBI;2530}2531} else if (NumBytes) {2532// Adjust stack pointer back: ESP += numbytes.2533emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);2534if (!HasFP && NeedsDwarfCFI) {2535// Define the current CFA rule to use the provided offset.2536BuildCFI(MBB, MBBI, DL,2537MCCFIInstruction::cfiDefCfaOffset(2538nullptr, CSSize + TailCallArgReserveSize + SlotSize),2539MachineInstr::FrameDestroy);2540}2541--MBBI;2542}25432544// Windows unwinder will not invoke function's exception handler if IP is2545// either in prologue or in epilogue. This behavior causes a problem when a2546// call immediately precedes an epilogue, because the return address points2547// into the epilogue. To cope with that, we insert an epilogue marker here,2548// then replace it with a 'nop' if it ends up immediately after a CALL in the2549// final emitted code.2550if (NeedsWin64CFI && MF.hasWinCFI())2551BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));25522553if (!HasFP && NeedsDwarfCFI) {2554MBBI = FirstCSPop;2555int64_t Offset = -(int64_t)CSSize - SlotSize;2556// Mark callee-saved pop instruction.2557// Define the current CFA rule to use the provided offset.2558while (MBBI != MBB.end()) {2559MachineBasicBlock::iterator PI = MBBI;2560unsigned Opc = PI->getOpcode();2561++MBBI;2562if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||2563Opc == X86::POP2 || Opc == X86::POP2P) {2564Offset += SlotSize;2565// Compared to pop, pop2 introduces more stack offset (one more2566// register).2567if (Opc == X86::POP2 || Opc == X86::POP2P)2568Offset += SlotSize;2569BuildCFI(MBB, MBBI, DL,2570MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),2571MachineInstr::FrameDestroy);2572}2573}2574}25752576// Emit DWARF info specifying the restores of the callee-saved registers.2577// For epilogue with return inside or being other block without successor,2578// no need to generate .cfi_restore for callee-saved registers.2579if (NeedsDwarfCFI && !MBB.succ_empty())2580emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);25812582if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {2583// Add the return addr area delta back since we are not tail calling.2584int Offset = -1 * X86FI->getTCReturnAddrDelta();2585assert(Offset >= 0 && "TCDelta should never be positive");2586if (Offset) {2587// Check for possible merge with preceding ADD instruction.2588Offset += mergeSPUpdates(MBB, Terminator, true);2589emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);2590}2591}25922593// Emit tilerelease for AMX kernel.2594if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA)2595BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));2596}25972598StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,2599int FI,2600Register &FrameReg) const {2601const MachineFrameInfo &MFI = MF.getFrameInfo();26022603bool IsFixed = MFI.isFixedObjectIndex(FI);2604// We can't calculate offset from frame pointer if the stack is realigned,2605// so enforce usage of stack/base pointer. The base pointer is used when we2606// have dynamic allocas in addition to dynamic realignment.2607if (TRI->hasBasePointer(MF))2608FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();2609else if (TRI->hasStackRealignment(MF))2610FrameReg = IsFixed ? 
TRI->getFramePtr() : TRI->getStackRegister();2611else2612FrameReg = TRI->getFrameRegister(MF);26132614// Offset will hold the offset from the stack pointer at function entry to the2615// object.2616// We need to factor in additional offsets applied during the prologue to the2617// frame, base, and stack pointer depending on which is used.2618int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();2619const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2620unsigned CSSize = X86FI->getCalleeSavedFrameSize();2621uint64_t StackSize = MFI.getStackSize();2622bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();2623int64_t FPDelta = 0;26242625// In an x86 interrupt, remove the offset we added to account for the return2626// address from any stack object allocated in the caller's frame. Interrupts2627// do not have a standard return address. Fixed objects in the current frame,2628// such as SSE register spills, should not get this treatment.2629if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&2630Offset >= 0) {2631Offset += getOffsetOfLocalArea();2632}26332634if (IsWin64Prologue) {2635assert(!MFI.hasCalls() || (StackSize % 16) == 8);26362637// Calculate required stack adjustment.2638uint64_t FrameSize = StackSize - SlotSize;2639// If required, include space for extra hidden slot for stashing base2640// pointer.2641if (X86FI->getRestoreBasePointer())2642FrameSize += SlotSize;2643uint64_t NumBytes = FrameSize - CSSize;26442645uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);2646if (FI && FI == X86FI->getFAIndex())2647return StackOffset::getFixed(-SEHFrameOffset);26482649// FPDelta is the offset from the "traditional" FP location of the old base2650// pointer followed by return address and the location required by the2651// restricted Win64 prologue.2652// Add FPDelta to all offsets below that go through the frame pointer.2653FPDelta = FrameSize - SEHFrameOffset;2654assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&2655"FPDelta isn't aligned per the Win64 ABI!");2656}26572658if (FrameReg == TRI->getFramePtr()) {2659// Skip saved EBP/RBP2660Offset += SlotSize;26612662// Account for restricted Windows prologue.2663Offset += FPDelta;26642665// Skip the RETADDR move area2666int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();2667if (TailCallReturnAddrDelta < 0)2668Offset -= TailCallReturnAddrDelta;26692670return StackOffset::getFixed(Offset);2671}26722673// FrameReg is either the stack pointer or a base pointer. 
But the base is2674// located at the end of the statically known StackSize so the distinction2675// doesn't really matter.2676if (TRI->hasStackRealignment(MF) || TRI->hasBasePointer(MF))2677assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));2678return StackOffset::getFixed(Offset + StackSize);2679}26802681int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,2682Register &FrameReg) const {2683const MachineFrameInfo &MFI = MF.getFrameInfo();2684const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2685const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();2686const auto it = WinEHXMMSlotInfo.find(FI);26872688if (it == WinEHXMMSlotInfo.end())2689return getFrameIndexReference(MF, FI, FrameReg).getFixed();26902691FrameReg = TRI->getStackRegister();2692return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +2693it->second;2694}26952696StackOffset2697X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,2698Register &FrameReg,2699int Adjustment) const {2700const MachineFrameInfo &MFI = MF.getFrameInfo();2701FrameReg = TRI->getStackRegister();2702return StackOffset::getFixed(MFI.getObjectOffset(FI) -2703getOffsetOfLocalArea() + Adjustment);2704}27052706StackOffset2707X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,2708int FI, Register &FrameReg,2709bool IgnoreSPUpdates) const {27102711const MachineFrameInfo &MFI = MF.getFrameInfo();2712// Does not include any dynamic realign.2713const uint64_t StackSize = MFI.getStackSize();2714// LLVM arranges the stack as follows:2715// ...2716// ARG22717// ARG12718// RETADDR2719// PUSH RBP <-- RBP points here2720// PUSH CSRs2721// ~~~~~~~ <-- possible stack realignment (non-win64)2722// ...2723// STACK OBJECTS2724// ... <-- RSP after prologue points here2725// ~~~~~~~ <-- possible stack realignment (win64)2726//2727// if (hasVarSizedObjects()):2728// ... <-- "base pointer" (ESI/RBX) points here2729// DYNAMIC ALLOCAS2730// ... <-- RSP points here2731//2732// Case 1: In the simple case of no stack realignment and no dynamic2733// allocas, both "fixed" stack objects (arguments and CSRs) are addressable2734// with fixed offsets from RSP.2735//2736// Case 2: In the case of stack realignment with no dynamic allocas, fixed2737// stack objects are addressed with RBP and regular stack objects with RSP.2738//2739// Case 3: In the case of dynamic allocas and stack realignment, RSP is used2740// to address stack arguments for outgoing calls and nothing else. The "base2741// pointer" points to local variables, and RBP points to fixed objects.2742//2743// In cases 2 and 3, we can only answer for non-fixed stack objects, and the2744// answer we give is relative to the SP after the prologue, and not the2745// SP in the middle of the function.27462747if (MFI.isFixedObjectIndex(FI) && TRI->hasStackRealignment(MF) &&2748!STI.isTargetWin64())2749return getFrameIndexReference(MF, FI, FrameReg);27502751// If !hasReservedCallFrame the function might have SP adjustement in the2752// body. So, even though the offset is statically known, it depends on where2753// we are in the function.2754if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))2755return getFrameIndexReference(MF, FI, FrameReg);27562757// We don't handle tail calls, and shouldn't be seeing them either.2758assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&2759"we don't handle this case!");27602761// This is how the math works out:2762//2763// %rsp grows (i.e. 
gets lower) left to right. Each box below is
  // one word (eight bytes). Obj0 is the stack slot we're trying to
  // get to.
  //
  //    ----------------------------------
  //    | BP | Obj0 | Obj1 | ... | ObjN |
  //    ----------------------------------
  //    ^    ^      ^                   ^
  //    A    B      C                   E
  //
  // A is the incoming stack pointer.
  // (B - A) is the local area offset (-8 for x86-64) [1]
  // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
  //
  // |(E - B)| is the StackSize (absolute value, positive). For a
  // stack that grows down, this works out to be (B - E). [3]
  //
  // E is also the value of %rsp after the stack has been set up, and we
  // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
  //    (C - E) == (C - A) - (B - A) + (B - E)
  //            { Using [1], [2] and [3] above }
  //            == getObjectOffset - LocalAreaOffset + StackSize

  return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
}

bool X86FrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  unsigned CalleeSavedFrameSize = 0;
  unsigned XMMCalleeSavedFrameSize = 0;
  auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();

  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();

  if (TailCallReturnAddrDelta < 0) {
    // create RETURNADDR area
    //   arg
    //   arg
    //   RETADDR
    //   { ...
    //     RETADDR area
    //     ...
    //   }
    //   [EBP]
    MFI.CreateFixedObject(-TailCallReturnAddrDelta,
                          TailCallReturnAddrDelta - SlotSize, true);
  }

  // Spill the BasePtr if it's used.
  if (this->TRI->hasBasePointer(MF)) {
    // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
    if (MF.hasEHFunclets()) {
      int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
      X86FI->setHasSEHFramePtrSave(true);
      X86FI->setSEHFramePtrSaveIndex(FI);
    }
  }

  if (hasFP(MF)) {
    // emitPrologue always spills the frame register first.
    SpillSlotOffset -= SlotSize;
    MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);

    // The async context lives directly before the frame pointer, and we
    // allocate a second slot to preserve stack alignment.
    if (X86FI->hasSwiftAsyncContext()) {
      SpillSlotOffset -= SlotSize;
      MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
      SpillSlotOffset -= SlotSize;
    }

    // Since emitPrologue and emitEpilogue will handle spilling and restoring
    // of the frame register, we can delete it from the CSI list and not have
    // to worry about avoiding it later.
    Register FPReg = TRI->getFrameRegister(MF);
    for (unsigned i = 0; i < CSI.size(); ++i) {
      if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
        CSI.erase(CSI.begin() + i);
        break;
      }
    }
  }

  // Strategy:
  // 1. Use push2 when
  //      a) the number of CSRs > 1, if no padding is needed
  //      b) the number of CSRs > 2, if padding is needed
  // 2. When the number of CSR push is odd
  //      a. Start to use push2 from the 1st push if stack is 16B aligned.
  //      b. Start to use push2 from the 2nd push if stack is not 16B aligned.
  // 3.
When the number of CSR push is even, start to use push2 from the 1st2859// push and make the stack 16B aligned before the push2860unsigned NumRegsForPush2 = 0;2861if (STI.hasPush2Pop2()) {2862unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {2863return X86::GR64RegClass.contains(I.getReg());2864});2865bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);2866bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;2867X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);2868NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;2869if (X86FI->padForPush2Pop2()) {2870SpillSlotOffset -= SlotSize;2871MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2872}2873}28742875// Assign slots for GPRs. It increases frame size.2876for (CalleeSavedInfo &I : llvm::reverse(CSI)) {2877Register Reg = I.getReg();28782879if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))2880continue;28812882// A CSR is a candidate for push2/pop2 when it's slot offset is 16B aligned2883// or only an odd number of registers in the candidates.2884if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&2885(SpillSlotOffset % 16 == 0 ||2886X86FI->getNumCandidatesForPush2Pop2() % 2))2887X86FI->addCandidateForPush2Pop2(Reg);28882889SpillSlotOffset -= SlotSize;2890CalleeSavedFrameSize += SlotSize;28912892int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2893I.setFrameIdx(SlotIndex);2894}28952896// Adjust the offset of spill slot as we know the accurate callee saved frame2897// size.2898if (X86FI->getRestoreBasePointer()) {2899SpillSlotOffset -= SlotSize;2900CalleeSavedFrameSize += SlotSize;29012902MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2903// TODO: saving the slot index is better?2904X86FI->setRestoreBasePointer(CalleeSavedFrameSize);2905}2906assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&2907"Expect even candidates for push2/pop2");2908if (X86FI->getNumCandidatesForPush2Pop2())2909++NumFunctionUsingPush2Pop2;2910X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);2911MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);29122913// Assign slots for XMMs.2914for (CalleeSavedInfo &I : llvm::reverse(CSI)) {2915Register Reg = I.getReg();2916if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))2917continue;29182919// If this is k-register make sure we lookup via the largest legal type.2920MVT VT = MVT::Other;2921if (X86::VK16RegClass.contains(Reg))2922VT = STI.hasBWI() ? 
MVT::v64i1 : MVT::v16i1;29232924const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);2925unsigned Size = TRI->getSpillSize(*RC);2926Align Alignment = TRI->getSpillAlign(*RC);2927// ensure alignment2928assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");2929SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);29302931// spill into slot2932SpillSlotOffset -= Size;2933int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);2934I.setFrameIdx(SlotIndex);2935MFI.ensureMaxAlignment(Alignment);29362937// Save the start offset and size of XMM in stack frame for funclets.2938if (X86::VR128RegClass.contains(Reg)) {2939WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;2940XMMCalleeSavedFrameSize += Size;2941}2942}29432944return true;2945}29462947bool X86FrameLowering::spillCalleeSavedRegisters(2948MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,2949ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {2950DebugLoc DL = MBB.findDebugLoc(MI);29512952// Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI2953// for us, and there are no XMM CSRs on Win32.2954if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())2955return true;29562957// Push GPRs. It increases frame size.2958const MachineFunction &MF = *MBB.getParent();2959const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2960if (X86FI->padForPush2Pop2())2961emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);29622963// Update LiveIn of the basic block and decide whether we can add a kill flag2964// to the use.2965auto UpdateLiveInCheckCanKill = [&](Register Reg) {2966const MachineRegisterInfo &MRI = MF.getRegInfo();2967// Do not set a kill flag on values that are also marked as live-in. This2968// happens with the @llvm-returnaddress intrinsic and with arguments2969// passed in callee saved registers.2970// Omitting the kill flags is conservatively correct even if the live-in2971// is not used after all.2972if (MRI.isLiveIn(Reg))2973return false;2974MBB.addLiveIn(Reg);2975// Check if any subregister is live-in2976for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)2977if (MRI.isLiveIn(*AReg))2978return false;2979return true;2980};2981auto UpdateLiveInGetKillRegState = [&](Register Reg) {2982return getKillRegState(UpdateLiveInCheckCanKill(Reg));2983};29842985for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {2986Register Reg = RI->getReg();2987if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))2988continue;29892990if (X86FI->isCandidateForPush2Pop2(Reg)) {2991Register Reg2 = (++RI)->getReg();2992BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))2993.addReg(Reg, UpdateLiveInGetKillRegState(Reg))2994.addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))2995.setMIFlag(MachineInstr::FrameSetup);2996} else {2997BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))2998.addReg(Reg, UpdateLiveInGetKillRegState(Reg))2999.setMIFlag(MachineInstr::FrameSetup);3000}3001}30023003if (X86FI->getRestoreBasePointer()) {3004unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;3005Register BaseReg = this->TRI->getBaseRegister();3006BuildMI(MBB, MI, DL, TII.get(Opc))3007.addReg(BaseReg, getKillRegState(true))3008.setMIFlag(MachineInstr::FrameSetup);3009}30103011// Make XMM regs spilled. 
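  // For example, on Win64 a callee-saved %xmm6 is not pushed; it is stored to
  // its fixed 16-byte spill slot via storeRegToStackSlot below (the
  // "movaps %<xmm reg>, KKK(%rsp)" form shown in the prologue gist above).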
X86 does not have ability of push/pop XMM.3012// It can be done by spilling XMMs to stack frame.3013for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {3014Register Reg = I.getReg();3015if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))3016continue;30173018// If this is k-register make sure we lookup via the largest legal type.3019MVT VT = MVT::Other;3020if (X86::VK16RegClass.contains(Reg))3021VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;30223023// Add the callee-saved register as live-in. It's killed at the spill.3024MBB.addLiveIn(Reg);3025const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);30263027TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,3028Register());3029--MI;3030MI->setFlag(MachineInstr::FrameSetup);3031++MI;3032}30333034return true;3035}30363037void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,3038MachineBasicBlock::iterator MBBI,3039MachineInstr *CatchRet) const {3040// SEH shouldn't use catchret.3041assert(!isAsynchronousEHPersonality(classifyEHPersonality(3042MBB.getParent()->getFunction().getPersonalityFn())) &&3043"SEH should not use CATCHRET");3044const DebugLoc &DL = CatchRet->getDebugLoc();3045MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();30463047// Fill EAX/RAX with the address of the target block.3048if (STI.is64Bit()) {3049// LEA64r CatchRetTarget(%rip), %rax3050BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)3051.addReg(X86::RIP)3052.addImm(0)3053.addReg(0)3054.addMBB(CatchRetTarget)3055.addReg(0);3056} else {3057// MOV32ri $CatchRetTarget, %eax3058BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)3059.addMBB(CatchRetTarget);3060}30613062// Record that we've taken the address of CatchRetTarget and no longer just3063// reference it in a terminator.3064CatchRetTarget->setMachineBlockAddressTaken();3065}30663067bool X86FrameLowering::restoreCalleeSavedRegisters(3068MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,3069MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {3070if (CSI.empty())3071return false;30723073if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {3074// Don't restore CSRs in 32-bit EH funclets. Matches3075// spillCalleeSavedRegisters.3076if (STI.is32Bit())3077return true;3078// Don't restore CSRs before an SEH catchret. SEH except blocks do not form3079// funclets. emitEpilogue transforms these to normal jumps.3080if (MI->getOpcode() == X86::CATCHRET) {3081const Function &F = MBB.getParent()->getFunction();3082bool IsSEH = isAsynchronousEHPersonality(3083classifyEHPersonality(F.getPersonalityFn()));3084if (IsSEH)3085return true;3086}3087}30883089DebugLoc DL = MBB.findDebugLoc(MI);30903091// Reload XMMs from stack frame.3092for (const CalleeSavedInfo &I : CSI) {3093Register Reg = I.getReg();3094if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))3095continue;30963097// If this is k-register make sure we lookup via the largest legal type.3098MVT VT = MVT::Other;3099if (X86::VK16RegClass.contains(Reg))3100VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;31013102const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);3103TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,3104Register());3105}31063107// Clear the stack slot for spill base pointer register.3108MachineFunction &MF = *MBB.getParent();3109const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();3110if (X86FI->getRestoreBasePointer()) {3111unsigned Opc = STI.is64Bit() ? 
    unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
    Register BaseReg = this->TRI->getBaseRegister();
    BuildMI(MBB, MI, DL, TII.get(Opc), BaseReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  // POP GPRs.
  for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
    Register Reg = I->getReg();
    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
      continue;

    if (X86FI->isCandidateForPush2Pop2(Reg))
      BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
          .addReg((++I)->getReg(), RegState::Define)
          .setMIFlag(MachineInstr::FrameDestroy);
    else
      BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
          .setMIFlag(MachineInstr::FrameDestroy);
  }
  if (X86FI->padForPush2Pop2())
    emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);

  return true;
}

void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedRegs,
                                            RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

  // Spill the BasePtr if it's used.
  if (TRI->hasBasePointer(MF)) {
    Register BasePtr = TRI->getBaseRegister();
    if (STI.isTarget64BitILP32())
      BasePtr = getX86SubSuperRegister(BasePtr, 64);
    SavedRegs.set(BasePtr);
  }
}

static bool HasNestArgument(const MachineFunction *MF) {
  const Function &F = MF->getFunction();
  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
       I++) {
    if (I->hasNestAttr() && !I->use_empty())
      return true;
  }
  return false;
}

/// GetScratchRegister - Get a temp register for performing work in the
/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
/// and the properties of the function either one or two registers will be
/// needed. Set primary to true for the first register, false for the second.
static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64,
                                   const MachineFunction &MF, bool Primary) {
  CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();

  // Erlang stuff.
  if (CallingConvention == CallingConv::HiPE) {
    if (Is64Bit)
      return Primary ? X86::R14 : X86::R13;
    else
      return Primary ? X86::EBX : X86::EDI;
  }

  if (Is64Bit) {
    if (IsLP64)
      return Primary ? X86::R11 : X86::R12;
    else
      return Primary ? X86::R11D : X86::R12D;
  }

  bool IsNested = HasNestArgument(&MF);

  if (CallingConvention == CallingConv::X86_FastCall ||
      CallingConvention == CallingConv::Fast ||
      CallingConvention == CallingConv::Tail) {
    if (IsNested)
      report_fatal_error("Segmented stacks do not support fastcall with "
                         "nested functions.");
    return Primary ? X86::EAX : X86::ECX;
  }
  if (IsNested)
    return Primary ? X86::EDX : X86::EAX;
  return Primary ? X86::ECX : X86::EAX;
}

// The stack limit in the TCB is set to this many bytes above the actual stack
// limit.
static const uint64_t kSplitStackAvailable = 256;

void X86FrameLowering::adjustForSegmentedStacks(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t StackSize;
  unsigned TlsReg, TlsOffset;
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
         "Scratch register is live-in");

  if (MF.getFunction().isVarArg())
    report_fatal_error("Segmented stacks do not support vararg functions.");
  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
      !STI.isTargetDragonFly())
    report_fatal_error("Segmented stacks not supported on this platform.");

  // Eventually StackSize will be calculated by a link-time pass, which will
  // also decide whether checking code needs to be injected into this
  // particular prologue.
  StackSize = MFI.getStackSize();

  if (!MFI.needsSplitStackProlog())
    return;

  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  bool IsNested = false;

  // We need to know if the function has a nest argument only in 64 bit mode.
  if (Is64Bit)
    IsNested = HasNestArgument(&MF);

  // The MOV R10, RAX needs to be in a different block, since the RET we emit
  // in allocMBB needs to be the last (terminating) instruction.

  for (const auto &LI : PrologueMBB.liveins()) {
    allocMBB->addLiveIn(LI);
    checkMBB->addLiveIn(LI);
  }

  if (IsNested)
    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);

  MF.push_front(allocMBB);
  MF.push_front(checkMBB);

  // When the frame size is less than 256 bytes we just compare the stack
  // boundary directly to the value of the stack pointer, per gcc.
  bool CompareStackPointer = StackSize < kSplitStackAvailable;

  // Read the limit of the current stacklet from the stack_guard location.
  if (Is64Bit) {
    if (STI.isTargetLinux()) {
      TlsReg = X86::FS;
      TlsOffset = IsLP64 ? 0x70 : 0x40;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90.
    } else if (STI.isTargetWin64()) {
      TlsReg = X86::GS;
      TlsOffset = 0x28; // pvArbitrary, reserved for application use
    } else if (STI.isTargetFreeBSD()) {
      TlsReg = X86::FS;
      TlsOffset = 0x18;
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
              ScratchReg)
          .addReg(X86::RSP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);
    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm))
        .addReg(ScratchReg)
        .addReg(0)
        .addImm(1)
        .addReg(0)
        .addImm(TlsOffset)
        .addReg(TlsReg);
  } else {
    if (STI.isTargetLinux()) {
      TlsReg = X86::GS;
      TlsOffset = 0x30;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x48 + 90 * 4;
    } else if (STI.isTargetWin32()) {
      TlsReg = X86::FS;
      TlsOffset = 0x14; // pvArbitrary, reserved for application use
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
    } else if (STI.isTargetFreeBSD()) {
      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg)
          .addReg(X86::ESP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);

    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
        STI.isTargetDragonFly()) {
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(0)
          .addImm(0)
          .addReg(0)
          .addImm(TlsOffset)
          .addReg(TlsReg);
    } else if (STI.isTargetDarwin()) {

      // TlsOffset doesn't fit into a mod r/m byte so we need an extra
      // register.
      unsigned ScratchReg2;
      bool SaveScratch2;
      if (CompareStackPointer) {
        // The primary scratch register is available for holding the TLS
        // offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
        SaveScratch2 = false;
      } else {
        // Need to use a second register to hold the TLS offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);

        // Unfortunately, with fastcc the second scratch register may hold an
        // argument.
        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
      }

      // If Scratch2 is live-in then it needs to be saved.
      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
             "Scratch register is live-in and not saved");

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
            .addReg(ScratchReg2, RegState::Kill);

      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
          .addImm(TlsOffset);
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(ScratchReg2)
          .addImm(1)
          .addReg(0)
          .addImm(0)
          .addReg(TlsReg);

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
    }
  }

  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
  // It jumps to normal execution of the function body.
  BuildMI(checkMBB, DL, TII.get(X86::JCC_1))
      .addMBB(&PrologueMBB)
      .addImm(X86::COND_A);
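  // Illustrative sketch (assuming x86-64 Linux, LP64, and a frame larger than
  // kSplitStackAvailable): the check block built above roughly corresponds to
  //   leaq -<StackSize>(%rsp), %r11   # ScratchReg = SP minus frame size
  //   cmpq %fs:0x70, %r11             # compare against the stacklet limit
  //   ja   <PrologueMBB>              # enough space: run the normal prologue
  // with the fall-through edge leading to the __morestack call emitted below.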
  // On 32 bit we first push the arguments size and then the frame size. On 64
  // bit, we pass the stack frame size in r10 and the argument size in r11.
  if (Is64Bit) {
    // Functions with nested arguments use R10, so it needs to be saved across
    // the call to __morestack.

    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;

    if (IsNested)
      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);

    BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10)
        .addImm(StackSize);
    BuildMI(allocMBB, DL,
            TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())),
            Reg11)
        .addImm(X86FI->getArgumentStackSize());
  } else {
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
        .addImm(X86FI->getArgumentStackSize());
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize);
  }

  // __morestack is in libgcc.
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // Under the large code model, we cannot assume that __morestack lives
    // within 2^31 bytes of the call site, so we cannot use pc-relative
    // addressing. We cannot perform the call via a temporary register,
    // as the rax register may be used to store the static chain, and all
    // other suitable registers may be either callee-save or used for
    // parameter passing. We cannot use the stack at this point either
    // because __morestack manipulates the stack directly.
    //
    // To avoid these issues, perform an indirect call via a read-only memory
    // location containing the address.
    //
    // This solution is not perfect, as it assumes that the .rodata section
    // is laid out within 2^31 bytes of each function body, but this seems
    // to be sufficient for JIT.
    // FIXME: Add retpoline support and remove the error here.
    if (STI.useIndirectThunkCalls())
      report_fatal_error("Emitting morestack calls on 64-bit with the large "
                         "code model and thunks not yet implemented.");
    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
        .addReg(X86::RIP)
        .addImm(0)
        .addReg(0)
        .addExternalSymbol("__morestack_addr")
        .addReg(0);
  } else {
    if (Is64Bit)
      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
          .addExternalSymbol("__morestack");
    else
      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("__morestack");
  }

  if (IsNested)
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
  else
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));

  allocMBB->addSuccessor(&PrologueMBB);

  checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
  checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());

#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
/// HiPE provides Erlang Runtime System-internal parameters, such as PCB
/// offsets to fields it needs, through a named metadata node "hipe.literals"
/// containing name-value pairs.
static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD,
                               const StringRef LiteralName) {
  for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
    MDNode *Node = HiPELiteralsMD->getOperand(i);
    if (Node->getNumOperands() != 2)
      continue;
    MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
    ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
    if (!NodeName || !NodeVal)
      continue;
    ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
    if (ValConst && NodeName->getString() == LiteralName) {
      return ValConst->getZExtValue();
    }
  }

  report_fatal_error("HiPE literal " + LiteralName +
                     " required but not provided");
}
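// For illustration only, the named metadata consumed by getHiPELiteral might
// look like this in IR (the numeric values below are hypothetical; the real
// ones are provided by the Erlang runtime):
//
//   !hipe.literals = !{!0, !1, !2}
//   !0 = !{!"P_NSP_LIMIT", i32 152}
//   !1 = !{!"X86_LEAF_WORDS", i32 24}
//   !2 = !{!"AMD64_LEAF_WORDS", i32 24}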
// Return true if there are no non-ehpad successors to MBB and there are no
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
                                  MachineBasicBlock::const_iterator MBBI) {
  return llvm::all_of(
             MBB.successors(),
             [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
         std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
           return MI.isMetaInstruction();
         });
}

/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of a hybrid stack/heap architecture.
/// (For more information see Eric Stenman's Ph.D. thesis:
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
///       temp0 = sp - MaxStack
///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
///       ...
/// IncStack:
///       call inc_stack   # doubles the stack space
///       temp0 = sp - MaxStack
///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  // HiPE-specific values.
  NamedMDNode *HiPELiteralsMD =
      MF.getFunction().getParent()->getNamedMetadata("hipe.literals");
  if (!HiPELiteralsMD)
    report_fatal_error(
        "Can't generate HiPE prologue without runtime parameters");
  const unsigned HipeLeafWords = getHiPELiteral(
      HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
  const unsigned Guaranteed = HipeLeafWords * SlotSize;
  unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs
                                ? MF.getFunction().arg_size() - CCRegisteredArgs
                                : 0;
  unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize;

  assert(STI.isTargetLinux() &&
         "HiPE prologue is only supported on Linux operating systems.");
  // Compute the largest caller's frame that is needed to fit the callees'
  // frames. This 'MaxStack' is computed from:
  //
  // a) the fixed frame size, which is the space needed for all spilled temps,
  // b) outgoing on-stack parameter areas, and
  // c) the minimum stack space this function needs to make available for the
  //    functions it calls (a tunable ABI property).
  if (MFI.hasCalls()) {
    unsigned MoreStackForCalls = 0;

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        if (!MI.isCall())
          continue;

        // Get callee operand.
        const MachineOperand &MO = MI.getOperand(0);

        // Only take account of global function calls (no closures etc.).
        if (!MO.isGlobal())
          continue;

        const Function *F = dyn_cast<Function>(MO.getGlobal());
        if (!F)
          continue;

        // Do not update 'MaxStack' for primitive and built-in functions
        // (encoded with names either starting with "erlang."/"bif_" or not
        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
        // "_", such as the BIF "suspend_0") as they are executed on another
        // stack.
        if (F->getName().contains("erlang.") || F->getName().contains("bif_") ||
            F->getName().find_first_of("._") == StringRef::npos)
          continue;

        unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs
                                      ? F->arg_size() - CCRegisteredArgs
                                      : 0;
        if (HipeLeafWords - 1 > CalleeStkArity)
          MoreStackForCalls =
              std::max(MoreStackForCalls,
                       (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
      }
    }
    MaxStack += MoreStackForCalls;
  }

  // If the stack frame needed is larger than the guaranteed amount, runtime
  // checks and calls to the "inc_stack_0" BIF should be inserted in the
  // assembly prologue.
  if (MaxStack > Guaranteed) {
    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();

    for (const auto &LI : PrologueMBB.liveins()) {
      stackCheckMBB->addLiveIn(LI);
      incStackMBB->addLiveIn(LI);
    }

    MF.push_front(incStackMBB);
    MF.push_front(stackCheckMBB);

    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
    unsigned LEAop, CMPop, CALLop;
    SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
    if (Is64Bit) {
      SPReg = X86::RSP;
      PReg = X86::RBP;
      LEAop = X86::LEA64r;
      CMPop = X86::CMP64rm;
      CALLop = X86::CALL64pcrel32;
    } else {
      SPReg = X86::ESP;
      PReg = X86::EBP;
      LEAop = X86::LEA32r;
      CMPop = X86::CMP32rm;
      CALLop = X86::CALLpcrel32;
    }

    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
           "HiPE prologue scratch register is live-in");

    // Create new MBB for StackCheck:
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    // SPLimitOffset is in a fixed heap location (pointed by BP).
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1))
        .addMBB(&PrologueMBB)
        .addImm(X86::COND_AE);

    // Create new MBB for IncStack:
    BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0");
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(incStackMBB, DL, TII.get(X86::JCC_1))
        .addMBB(incStackMBB)
        .addImm(X86::COND_LE);
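    // Illustrative sketch (64-bit, with a hypothetical MaxStack of 320 bytes):
    // the two blocks built above roughly correspond to
    //   stackCheckMBB:  leaq -320(%rsp), %r14
    //                   cmpq <P_NSP_LIMIT>(%rbp), %r14
    //                   jae  <PrologueMBB>
    //   incStackMBB:    callq inc_stack_0
    //                   leaq -320(%rsp), %r14
    //                   cmpq <P_NSP_LIMIT>(%rbp), %r14
    //                   jle  incStackMBB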
    stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
    stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
    incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
    incStackMBB->addSuccessor(incStackMBB, {1, 100});
  }
#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           const DebugLoc &DL,
                                           int Offset) const {
  if (Offset <= 0)
    return false;

  if (Offset % SlotSize)
    return false;

  int NumPops = Offset / SlotSize;
  // This is only worth it if we have at most 2 pops.
  if (NumPops != 1 && NumPops != 2)
    return false;

  // Handle only the trivial case where the adjustment directly follows
  // a call. This is the most common one, anyway.
  if (MBBI == MBB.begin())
    return false;
  MachineBasicBlock::iterator Prev = std::prev(MBBI);
  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
    return false;

  unsigned Regs[2];
  unsigned FoundRegs = 0;

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const MachineOperand &RegMask = Prev->getOperand(1);

  auto &RegClass =
      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
  // Try to find up to NumPops free registers.
  for (auto Candidate : RegClass) {
    // Poor man's liveness:
    // Since we're immediately after a call, any register that is clobbered
    // by the call and not defined by it can be considered dead.
    if (!RegMask.clobbersPhysReg(Candidate))
      continue;

    // Don't clobber reserved registers.
    if (MRI.isReserved(Candidate))
      continue;

    bool IsDef = false;
    for (const MachineOperand &MO : Prev->implicit_operands()) {
      if (MO.isReg() && MO.isDef() &&
          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
        IsDef = true;
        break;
      }
    }

    if (IsDef)
      continue;

    Regs[FoundRegs++] = Candidate;
    if (FoundRegs == (unsigned)NumPops)
      break;
  }

  if (FoundRegs == 0)
    return false;

  // If we found only one free register, but need two, reuse the same one
  // twice.
  while (FoundRegs < (unsigned)NumPops)
    Regs[FoundRegs++] = Regs[0];

  for (int i = 0; i < NumPops; ++i)
    BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r),
            Regs[i]);

  return true;
}
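// A rough size example (assuming typical x86 encodings; not taken from the
// source): in a 32-bit minsize function, an 8-byte adjustment right after a
// call can be emitted as two one-byte pops of registers picked from
// GR32_NOREX_NOSP instead of a three-byte "addl $8, %esp", at the cost of two
// dead register writes.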
MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  bool reserveCallFrame = hasReservedCallFrame(MF);
  unsigned Opcode = I->getOpcode();
  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
  DebugLoc DL = I->getDebugLoc(); // copy DebugLoc as I will be erased.
  uint64_t Amount = TII.getFrameSize(*I);
  uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
  I = MBB.erase(I);
  auto InsertPos = skipDebugInstructionsForward(I, MBB.end());

  // Try to avoid emitting dead SP adjustments if the block end is unreachable,
  // typically because the function is marked noreturn (abort, throw,
  // assert_fail, etc).
  if (isDestroy && blockEndIsUnreachable(MBB, I))
    return I;

  if (!reserveCallFrame) {
    // If the stack pointer can be changed after prologue, turn the
    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
    // adjcallstackup instruction into 'add ESP, <amt>'.

    // We need to keep the stack aligned properly. To do this, we round the
    // amount of space needed for the outgoing arguments up to the next
    // alignment boundary.
    Amount = alignTo(Amount, getStackAlign());

    const Function &F = MF.getFunction();
    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
    bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();

    // If we have any exception handlers in this function, and we adjust
    // the SP before calls, we may need to indicate this to the unwinder
    // using GNU_ARGS_SIZE. Note that this may be necessary even when
    // Amount == 0, because the preceding function may have set a non-0
    // GNU_ARGS_SIZE.
    // TODO: We don't need to reset this between subsequent functions,
    // if it didn't change.
    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();

    if (HasDwarfEHHandlers && !isDestroy &&
        MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createGnuArgsSize(nullptr, Amount));

    if (Amount == 0)
      return I;

    // Factor out the amount that gets handled inside the sequence
    // (pushes of arguments for frame setup, callee pops for frame destroy).
    Amount -= InternalAmt;

    // TODO: This is needed only if we require precise CFA.
    // If this is a callee-pop calling convention, emit a CFA adjust for
    // the amount the callee popped.
    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));

    // Add Amount to SP to destroy a frame, or subtract to setup.
    int64_t StackAdjustment = isDestroy ? Amount : -Amount;

    if (StackAdjustment) {
      // Merge with any previous or following adjustment instruction. Note: the
      // instructions merged with here do not have CFI, so their stack
      // adjustments do not feed into CfaAdjustment.
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);

      if (StackAdjustment) {
        if (!(F.hasMinSize() &&
              adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
          BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
                               /*InEpilogue=*/false);
      }
    }
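    // Illustrative example (hypothetical amount): with call frame reservation
    // disabled, a call site bracketed by adjcallstackdown/adjcallstackup
    // pseudos for a 32-byte argument area is lowered here to 'sub ESP, 32'
    // before the call and 'add ESP, 32' after it, unless the adjustment is
    // merged into a neighbouring SP update or turned into pops above.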
    if (DwarfCFI && !hasFP(MF)) {
      // If we don't have FP, but need to generate unwind information,
      // we need to set the correct CFA offset after the stack adjustment.
      // How much we adjust the CFA offset depends on whether we're emitting
      // CFI only for EH purposes or for debugging. EH only requires the CFA
      // offset to be correct at each call site, while for debugging we want
      // it to be more precise.

      int64_t CfaAdjustment = -StackAdjustment;
      // TODO: When not using precise CFA, we also need to adjust for the
      // InternalAmt here.
      if (CfaAdjustment) {
        BuildCFI(
            MBB, InsertPos, DL,
            MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
      }
    }

    return I;
  }

  if (InternalAmt) {
    MachineBasicBlock::iterator CI = I;
    MachineBasicBlock::iterator B = MBB.begin();
    while (CI != B && !std::prev(CI)->isCall())
      --CI;
    BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
  }

  return I;
}

bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");
  const MachineFunction &MF = *MBB.getParent();
  if (!MBB.isLiveIn(X86::EFLAGS))
    return true;

  // If stack probes have to loop inline or call, that will clobber EFLAGS.
  // FIXME: we could allow cases that will use emitStackProbeInlineGenericBlock.
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  if (TLI.hasInlineStackProbe(MF) || TLI.hasStackProbeSymbol(MF))
    return false;

  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  return !TRI->hasStackRealignment(MF) && !X86FI->hasSwiftAsyncContext();
}

bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");

  // Win64 has strict requirements on the epilogue, and we are not taking a
  // chance at messing with them. I.e., unless this block is already an exit
  // block, we can't use it as an epilogue.
  if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
    return false;

  // Swift async context epilogue has a BTR instruction that clobbers parts of
  // EFLAGS.
  const MachineFunction &MF = *MBB.getParent();
  if (MF.getInfo<X86MachineFunctionInfo>()->hasSwiftAsyncContext())
    return !flagsNeedToBePreservedBeforeTheTerminators(MBB);

  if (canUseLEAForSPInEpilogue(*MBB.getParent()))
    return true;

  // If we cannot use LEA to adjust SP, we may need to use ADD, which
  // clobbers the EFLAGS. Check that we do not need to preserve it;
  // otherwise, conservatively assume it is not safe to insert the epilogue
  // here.
  return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
}

bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
  // If we may need to emit frameless compact unwind information, give
  // up as this is currently broken: PR25614.
  bool CompactUnwind =
      MF.getContext().getObjectFileInfo()->getCompactUnwindSection() != nullptr;
  return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
          !CompactUnwind) &&
         // The lowering of segmented stacks and HiPE only supports entry
         // blocks as prologue blocks: PR26107. This limitation may be
         // lifted if we fix:
         // - adjustForSegmentedStacks
         // - adjustForHiPEPrologue
         MF.getFunction().getCallingConv() != CallingConv::HiPE &&
         !MF.shouldSplitStack();
}

MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, bool RestoreSP) const {
  assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
  assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
  assert(STI.is32Bit() && !Uses64BitFramePtr &&
         "restoring EBP/ESI on non-32-bit target");

  MachineFunction &MF = *MBB.getParent();
  Register FramePtr = TRI->getFrameRegister(MF);
  Register BasePtr = TRI->getBaseRegister();
  WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // FIXME: Don't set FrameSetup flag in catchret case.

  int FI = FuncInfo.EHRegNodeFrameIndex;
  int EHRegSize = MFI.getObjectSize(FI);

  if (RestoreSP) {
    // MOV32rm -EHRegSize(%ebp), %esp
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
                 X86::EBP, true, -EHRegSize)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  Register UsedReg;
  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
  int EndOffset = -EHRegOffset - EHRegSize;
  FuncInfo.EHRegNodeEndOffset = EndOffset;

  if (UsedReg == FramePtr) {
    // ADD $offset, %ebp
    unsigned ADDri = getADDriOpcode(false);
    BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
        .addReg(FramePtr)
        .addImm(EndOffset)
        .setMIFlag(MachineInstr::FrameSetup)
        ->getOperand(3)
        .setIsDead();
    assert(EndOffset >= 0 &&
           "end of registration object above normal EBP position!");
  } else if (UsedReg == BasePtr) {
    // LEA offset(%ebp), %esi
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
                 FramePtr, false, EndOffset)
        .setMIFlag(MachineInstr::FrameSetup);
    // MOV32rm SavedEBPOffset(%esi), %ebp
    assert(X86FI->getHasSEHFramePtrSave());
    int Offset =
        getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
            .getFixed();
    assert(UsedReg == BasePtr);
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
                 UsedReg, true, Offset)
        .setMIFlag(MachineInstr::FrameSetup);
  } else {
    llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
  }
  return MBBI;
}

int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
  return TRI->getSlotSize();
}

Register
X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
  return StackPtr;
}

TargetFrameLowering::DwarfFrameBase
X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
  Register FrameRegister = RI->getFrameRegister(MF);
  if (getInitialCFARegister(MF) == FrameRegister &&
      MF.getInfo<X86MachineFunctionInfo>()->hasCFIAdjustCfa()) {
    DwarfFrameBase FrameBase;
    FrameBase.Kind = DwarfFrameBase::CFA;
    FrameBase.Location.Offset =
        -MF.getFrameInfo().getStackSize() - getInitialCFAOffset(MF);
    return FrameBase;
  }

  return DwarfFrameBase{DwarfFrameBase::Register, {FrameRegister}};
}

namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
  bool IsValid = false;             // True if we care about this Object.
  unsigned ObjectIndex = 0;         // Index of Object into MFI list.
  unsigned ObjectSize = 0;          // Size of Object in bytes.
  Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
  unsigned ObjectNumUses = 0;       // Object static number of uses.
};

// The comparison function we use for stable_sort to order our local
// stack symbols. The current algorithm is to use an estimated
// "density". This takes into consideration the size and number of
// uses each object has in order to roughly minimize code size.
// So, for example, an object of size 16B that is referenced 5 times
// will get higher priority than 4 4B objects referenced 1 time each.
// It's not perfect and we may be able to squeeze a few more bytes out of
// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
// fringe end can have special consideration, given their size is less
// important, etc.), but the algorithmic complexity grows too much to be
// worth the extra gains we get. This gets us pretty close.
// The final order leaves us with objects with highest priority going
// at the end of our list.
struct X86FrameSortingComparator {
  inline bool operator()(const X86FrameSortingObject &A,
                         const X86FrameSortingObject &B) const {
    uint64_t DensityAScaled, DensityBScaled;

    // For consistency in our comparison, all invalid objects are placed
    // at the end. This also allows us to stop walking when we hit the
    // first invalid item after it's all sorted.
    if (!A.IsValid)
      return false;
    if (!B.IsValid)
      return true;

    // The density is calculated by doing:
    //   (double)DensityA = A.ObjectNumUses / A.ObjectSize
    //   (double)DensityB = B.ObjectNumUses / B.ObjectSize
    // Since this approach may cause inconsistencies in
    // the floating point <, >, == comparisons, depending on the floating
    // point model with which the compiler was built, we're going
    // to scale both sides by multiplying with
    // A.ObjectSize * B.ObjectSize. This ends up factoring away
    // the division and, with it, the need for any floating point
    // arithmetic.
    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
                     static_cast<uint64_t>(B.ObjectSize);
    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
                     static_cast<uint64_t>(A.ObjectSize);
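    // Worked example (illustrative numbers only): for A = {8 bytes, 4 uses}
    // and B = {16 bytes, 6 uses}, DensityAScaled = 4 * 16 = 64 and
    // DensityBScaled = 6 * 8 = 48. A's true density (0.5) exceeds B's (0.375),
    // so A compares "greater" and ends up later in the list, i.e. with higher
    // priority.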
    // If the two densities are equal, prioritize highest alignment
    // objects. This allows for similar alignment objects
    // to be packed together (given the same density).
    // There's room for improvement here, also, since we can pack
    // similar alignment (different density) objects next to each
    // other to save padding. This will also require further
    // complexity/iterations, and the overall gain isn't worth it,
    // in general. Something to keep in mind, though.
    if (DensityAScaled == DensityBScaled)
      return A.ObjectAlignment < B.ObjectAlignment;

    return DensityAScaled < DensityBScaled;
  }
};
} // namespace

// Order the symbols in the local stack.
// We want to place the local stack objects in some sort of sensible order.
// The heuristic we use is to try and pack them according to static number
// of uses and size of object in order to minimize code size.
void X86FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Don't waste time if there's nothing to do.
  if (ObjectsToAllocate.empty())
    return;

  // Create an array of all MFI objects. We won't need all of these
  // objects, but we're going to create a full array of them to make
  // it easier to index into when we're counting "uses" down below.
  // We want to be able to easily/cheaply access an object by simply
  // indexing into it, instead of having to search for it every time.
  std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());

  // Walk the objects we care about and mark them as such in our working
  // struct.
  for (auto &Obj : ObjectsToAllocate) {
    SortingObjects[Obj].IsValid = true;
    SortingObjects[Obj].ObjectIndex = Obj;
    SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
    // Set the size.
    int ObjectSize = MFI.getObjectSize(Obj);
    if (ObjectSize == 0)
      // Variable size. Just use 4.
      SortingObjects[Obj].ObjectSize = 4;
    else
      SortingObjects[Obj].ObjectSize = ObjectSize;
  }

  // Count the number of uses for each object.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      for (const MachineOperand &MO : MI.operands()) {
        // Check to see if it's a local stack symbol.
        if (!MO.isFI())
          continue;
        int Index = MO.getIndex();
        // Check to see if it falls within our range, and is tagged
        // to require ordering.
        if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
            SortingObjects[Index].IsValid)
          SortingObjects[Index].ObjectNumUses++;
      }
    }
  }

  // Sort the objects using X86FrameSortingComparator (see its comment for
  // info).
  llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
  // Now modify the original list to represent the final order that we want.
  // The order will depend on whether we're going to access the objects from
  // the stack pointer or the frame pointer. For SP, the list should end with
  // the objects we want at smaller offsets. For FP, it should be flipped.
  int i = 0;
  for (auto &Obj : SortingObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  // Flip it if we're accessing off of the FP.
  if (!TRI->hasStackRealignment(MF) && hasFP(MF))
    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
}

unsigned
X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
  // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
  unsigned Offset = 16;
  // RBP is immediately pushed.
  Offset += SlotSize;
  // All callee-saved registers are then pushed.
  Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
  // Every funclet allocates enough stack space for the largest outgoing call.
  Offset += getWinEHFuncletFrameSize(MF);
  return Offset;
}

void X86FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  // Mark the function as not having WinCFI. We will set it back to true in
  // emitPrologue if it gets called and emits CFI.
  MF.setHasWinCFI(false);

  // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
  // aligned. The format doesn't support misaligned stack adjustments.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
    MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (STI.is64Bit() && MF.hasEHFunclets() &&
      classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
          EHPersonality::MSVC_CXX) {
    adjustFrameForMsvcCxxEh(MF);
  }
}

void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
  // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
  // relative to RSP after the prologue. Find the offset of the last fixed
  // object, so that we can allocate a slot immediately following it. If there
  // were no fixed objects, use offset -SlotSize, which is immediately after
  // the return address. Fixed objects have negative frame indices.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
  int64_t MinFixedObjOffset = -SlotSize;
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
    MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));

  for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
    for (WinEHHandlerType &H : TBME.HandlerArray) {
      int FrameIndex = H.CatchObj.FrameIndex;
      if (FrameIndex != INT_MAX) {
        // Ensure alignment.
        unsigned Align = MFI.getObjectAlign(FrameIndex).value();
        MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
        MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
        MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
      }
    }
  }

  // Ensure alignment.
  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
  int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
  int UnwindHelpFI =
      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
  // Store -2 into UnwindHelp on function entry. We have to scan forwards past
  // other frame setup instructions.
  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  DebugLoc DL = MBB.findDebugLoc(MBBI);
  addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
                    UnwindHelpFI)
      .addImm(-2);
}

void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  if (STI.is32Bit() && MF.hasEHFunclets())
    restoreWinEHStackPointersInParent(MF);
  // We have emitted the prologue and epilogue; the stack pointer saving
  // instruction is no longer needed.
  if (MachineInstr *MI = X86FI->getStackPtrSaveMI()) {
    MI->eraseFromParent();
    X86FI->setStackPtrSaveMI(nullptr);
  }
}

void X86FrameLowering::restoreWinEHStackPointersInParent(
    MachineFunction &MF) const {
  // 32-bit functions have to restore stack pointers when control is
  // transferred back to the parent function. These blocks are identified as
  // eh pads that are not funclet entries.
  bool IsSEH = isAsynchronousEHPersonality(
      classifyEHPersonality(MF.getFunction().getPersonalityFn()));
  for (MachineBasicBlock &MBB : MF) {
    bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
    if (NeedsRestore)
      restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
                                  /*RestoreSP=*/IsSEH);
  }
}