Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a pass that optimizes call sequences on x86.
// Currently, it converts movs of function parameters onto the stack into
// pushes. This is beneficial for two main reasons:
// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
// 2) It is possible to push memory arguments directly. So, if the
//    transformation is performed pre-reg-alloc, it can help relieve
//    register pressure.
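//
// For example (illustrative only; the callee name and registers are made up),
// a 32-bit call sequence such as:
//   movl $42, (%esp)
//   movl %eax, 4(%esp)
//   calll foo
// can instead be emitted as:
//   pushl %eax
//   pushl $42
//   calll foo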
//
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
#include "X86FrameLowering.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "x86-cf-opt"

static cl::opt<bool>
    NoX86CFOpt("no-x86-call-frame-opt",
               cl::desc("Avoid optimizing x86 call frames for size"),
               cl::init(false), cl::Hidden);

namespace {

class X86CallFrameOptimization : public MachineFunctionPass {
public:
  X86CallFrameOptimization() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  static char ID;

private:
  // Information we know about a particular call site
  struct CallContext {
    CallContext() : FrameSetup(nullptr), ArgStoreVector(4, nullptr) {}

    // Iterator referring to the frame setup instruction
    MachineBasicBlock::iterator FrameSetup;

    // Actual call instruction
    MachineInstr *Call = nullptr;

    // A copy of the stack pointer
    MachineInstr *SPCopy = nullptr;

    // The total displacement of all passed parameters
    int64_t ExpectedDist = 0;

    // The sequence of storing instructions used to pass the parameters
    SmallVector<MachineInstr *, 4> ArgStoreVector;

    // True if this call site has no stack parameters
    bool NoStackParams = false;

    // True if this call site can use push instructions
    bool UsePush = false;
  };

  typedef SmallVector<CallContext, 8> ContextVector;

  bool isLegal(MachineFunction &MF);

  bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);

  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I, CallContext &Context);

  void adjustCallSequence(MachineFunction &MF, const CallContext &Context);

  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
                                   Register Reg);

  enum InstClassification { Convert, Skip, Exit };

  InstClassification classifyInstruction(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                         const X86RegisterInfo &RegInfo,
                                         DenseSet<unsigned int> &UsedRegs);

  StringRef getPassName() const override { return "X86 Optimize Call Frame"; }

  const X86InstrInfo *TII = nullptr;
  const X86FrameLowering *TFL = nullptr;
  const X86Subtarget *STI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  unsigned SlotSize = 0;
  unsigned Log2SlotSize = 0;
};

} // end anonymous namespace
char X86CallFrameOptimization::ID = 0;
INITIALIZE_PASS(X86CallFrameOptimization, DEBUG_TYPE,
                "X86 Call Frame Optimization", false, false)

// This checks whether the transformation is legal.
// Also returns false in cases where it's potentially legal, but
// we don't even want to try.
bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
  if (NoX86CFOpt.getValue())
    return false;

  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
  // in the compact unwind encoding that Darwin uses. So, bail if there
  // is a danger of that being generated.
  if (STI->isTargetDarwin() &&
      (!MF.getLandingPads().empty() ||
       (MF.getFunction().needsUnwindTableEntry() && !TFL->hasFP(MF))))
    return false;

  // It is not valid to change the stack pointer outside the prolog/epilog
  // on 64-bit Windows.
  if (STI->isTargetWin64())
    return false;

  // You would expect straight-line code between call-frame setup and
  // call-frame destroy. You would be wrong. There are circumstances (e.g.
  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
  // end up with the setup and the destroy in different basic blocks.
  // This is bad, and breaks SP adjustment.
  // So, check that all of the frames in the function are closed inside
  // the same block, and, for good measure, that there are no nested frames.
  //
  // If any call allocates more argument stack memory than the stack
  // probe size, don't do this optimization. Otherwise, this pass
  // would need to synthesize additional stack probe calls to allocate
  // memory for arguments.
  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
  bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
  unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
  for (MachineBasicBlock &BB : MF) {
    bool InsideFrameSequence = false;
    for (MachineInstr &MI : BB) {
      if (MI.getOpcode() == FrameSetupOpcode) {
        if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
          return false;
        if (InsideFrameSequence)
          return false;
        InsideFrameSequence = true;
      } else if (MI.getOpcode() == FrameDestroyOpcode) {
        if (!InsideFrameSequence)
          return false;
        InsideFrameSequence = false;
      }
    }

    if (InsideFrameSequence)
      return false;
  }

  return true;
}

// Check whether this transformation is profitable for a particular
// function - in terms of code size.
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
                                            ContextVector &CallSeqVector) {
  // This transformation is always a win when we do not expect to have
  // a reserved call frame. Under other circumstances, it may be either
  // a win or a loss, and requires a heuristic.
  bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
  if (CannotReserveFrame)
    return true;

  Align StackAlign = TFL->getStackAlign();

  int64_t Advantage = 0;
  for (const auto &CC : CallSeqVector) {
    // Call sites where no parameters are passed on the stack
    // do not affect the cost, since there needs to be no
    // stack adjustment.
    if (CC.NoStackParams)
      continue;

    if (!CC.UsePush) {
      // If we don't use pushes for a particular call site,
      // we pay for not having a reserved call frame with an
      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
      // depending on the size of the constant.
      // TODO: Callee-pop functions should have a smaller penalty, because
      // an add is needed even with a reserved call frame.
      Advantage -= 6;
    } else {
      // We can use pushes. First, account for the fixed costs.
      // We'll need an add after the call.
      Advantage -= 3;
      // If we have to realign the stack, we'll also need a sub before
      if (!isAligned(StackAlign, CC.ExpectedDist))
        Advantage -= 3;
      // Now, for each push, we save ~3 bytes. For small constants, we actually
      // save more (up to 5 bytes), but 3 should be a good approximation.
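      // For example (illustrative): a 32-bit call site that passes four
      // slot-sized arguments (ExpectedDist == 16, stack already aligned)
      // pays 3 bytes for the add after the call and saves 4 * 3 = 12 bytes
      // on the stores, for a net win of 9 bytes.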
      Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
    }
  }

  return Advantage >= 0;
}

bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget<X86Subtarget>();
  TII = STI->getInstrInfo();
  TFL = STI->getFrameLowering();
  MRI = &MF.getRegInfo();

  const X86RegisterInfo &RegInfo =
      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
  SlotSize = RegInfo.getSlotSize();
  assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
  Log2SlotSize = Log2_32(SlotSize);

  if (skipFunction(MF.getFunction()) || !isLegal(MF))
    return false;

  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();

  bool Changed = false;

  ContextVector CallSeqVector;

  for (auto &MBB : MF)
    for (auto &MI : MBB)
      if (MI.getOpcode() == FrameSetupOpcode) {
        CallContext Context;
        collectCallInfo(MF, MBB, MI, Context);
        CallSeqVector.push_back(Context);
      }

  if (!isProfitable(MF, CallSeqVector))
    return false;

  for (const auto &CC : CallSeqVector) {
    if (CC.UsePush) {
      adjustCallSequence(MF, CC);
      Changed = true;
    }
  }

  return Changed;
}

X86CallFrameOptimization::InstClassification
X86CallFrameOptimization::classifyInstruction(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
  if (MI == MBB.end())
    return Exit;

  // The instructions we actually care about are movs onto the stack or special
  // cases of constant-stores to stack
  switch (MI->getOpcode()) {
  case X86::AND16mi:
  case X86::AND32mi:
  case X86::AND64mi32: {
    const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
    return ImmOp.getImm() == 0 ? Convert : Exit;
  }
  case X86::OR16mi:
  case X86::OR32mi:
  case X86::OR64mi32: {
    const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
    return ImmOp.getImm() == -1 ? Convert : Exit;
  }
  case X86::MOV32mi:
  case X86::MOV32mr:
  case X86::MOV64mi32:
  case X86::MOV64mr:
    return Convert;
  }

  // Not all calling conventions have only stack MOVs between the stack
  // adjust and the call.

  // We want to tolerate other instructions, to cover more cases.
  // In particular:
  // a) PCrel calls, where we expect an additional COPY of the basereg.
  // b) Passing frame-index addresses.
  // c) Calling conventions that have inreg parameters. These generate
  //    both copies and movs into registers.
  // To avoid creating lots of special cases, allow any instruction
  // that does not write into memory, does not def or use the stack
  // pointer, and does not def any register that was used by a preceding
  // push.
  // (Reading from memory is allowed, even if referenced through a
  // frame index, since these will get adjusted properly in PEI)

  // The reason for the last condition is that the pushes can't replace
  // the movs in place, because the order must be reversed.
  // So if we have a MOV32mr that uses EDX, then an instruction that defs
  // EDX, and then the call, after the transformation the push will use
  // the modified version of EDX, and not the original one.
  // Since we are still in SSA form at this point, we only need to
  // make sure we don't clobber any *physical* registers that were
  // used by an earlier mov that will become a push.

  if (MI->isCall() || MI->mayStore())
    return Exit;

  for (const MachineOperand &MO : MI->operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg.isPhysical())
      continue;
    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
      return Exit;
    if (MO.isDef()) {
      for (unsigned int U : UsedRegs)
        if (RegInfo.regsOverlap(Reg, U))
          return Exit;
    }
  }

  return Skip;
}

void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
                                               MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               CallContext &Context) {
  // Check that this particular call sequence is amenable to the
  // transformation.
  const X86RegisterInfo &RegInfo =
      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());

  // We expect to enter this at the beginning of a call sequence
  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
  MachineBasicBlock::iterator FrameSetup = I++;
  Context.FrameSetup = FrameSetup;

  // How much do we adjust the stack? This puts an upper bound on
  // the number of parameters actually passed on it.
  unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize;

  // A zero adjustment means no stack parameters
  if (!MaxAdjust) {
    Context.NoStackParams = true;
    return;
  }

  // Skip over DEBUG_VALUE.
  // For globals in PIC mode, we can have some LEAs here. Skip them as well.
  // TODO: Extend this to something that covers more cases.
  while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
    ++I;

  Register StackPtr = RegInfo.getStackRegister();
  auto StackPtrCopyInst = MBB.end();
  // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
  // register. If it's there, use that virtual register as stack pointer
  // instead. Also, we need to locate this instruction so that we can later
  // safely ignore it while doing the conservative processing of the call chain.
  // The COPY can be located anywhere between the call-frame setup
  // instruction and its first use. We use the call instruction as a boundary
  // because it is usually cheaper to check if an instruction is a call than
  // checking if an instruction uses a register.
  for (auto J = I; !J->isCall(); ++J)
    if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
        J->getOperand(1).getReg() == StackPtr) {
      StackPtrCopyInst = J;
      Context.SPCopy = &*J++;
      StackPtr = Context.SPCopy->getOperand(0).getReg();
      break;
    }

  // Scan the call setup sequence for the pattern we're looking for.
  // We only handle a simple case - a sequence of store instructions that
  // push a sequence of stack-slot-aligned values onto the stack, with
  // no gaps between them.
  if (MaxAdjust > 4)
    Context.ArgStoreVector.resize(MaxAdjust, nullptr);

  DenseSet<unsigned int> UsedRegs;

  for (InstClassification Classification = Skip; Classification != Exit; ++I) {
    // If this is the COPY of the stack pointer, it's ok to ignore.
    if (I == StackPtrCopyInst)
      continue;
    Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs);
    if (Classification != Convert)
      continue;
    // We know the instruction has a supported store opcode.
    // We only want movs of the form:
    // mov imm/reg, k(%StackPtr)
    // If we run into something else, bail.
    // Note that AddrBaseReg may, counter to its name, not be a register,
    // but rather a frame index.
    // TODO: Support the fi case. This should probably work now that we
    // have the infrastructure to track the stack pointer within a call
    // sequence.
    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
        !I->getOperand(X86::AddrScaleAmt).isImm() ||
        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
        !I->getOperand(X86::AddrDisp).isImm())
      return;

    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    assert(StackDisp >= 0 &&
           "Negative stack displacement when passing parameters");

    // We really don't want to consider the unaligned case.
    if (StackDisp & (SlotSize - 1))
      return;
    StackDisp >>= Log2SlotSize;

    assert((size_t)StackDisp < Context.ArgStoreVector.size() &&
           "Function call has more parameters than the stack is adjusted for.");

    // If the same stack slot is being filled twice, something's fishy.
    if (Context.ArgStoreVector[StackDisp] != nullptr)
      return;
    Context.ArgStoreVector[StackDisp] = &*I;

    for (const MachineOperand &MO : I->uses()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isPhysical())
        UsedRegs.insert(Reg);
    }
  }

  --I;

  // We now expect the end of the sequence. If we stopped early,
  // or reached the end of the block without finding a call, bail.
  if (I == MBB.end() || !I->isCall())
    return;

  Context.Call = &*I;
  if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode())
    return;

  // Now, go through the vector, and see that we don't have any gaps,
  // but only a series of storing instructions.
  auto MMI = Context.ArgStoreVector.begin(), MME = Context.ArgStoreVector.end();
  for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
    if (*MMI == nullptr)
      break;

  // If the call had no parameters, do nothing
  if (MMI == Context.ArgStoreVector.begin())
    return;

  // We are either at the last parameter, or a gap.
  // Make sure it's not a gap
  for (; MMI != MME; ++MMI)
    if (*MMI != nullptr)
      return;

  Context.UsePush = true;
}

void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
                                                  const CallContext &Context) {
  // Ok, we can in fact do the transformation for this call.
  // Do not remove the FrameSetup instruction, but adjust the parameters.
  // PEI will end up finalizing the handling of this.
  MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
  MachineBasicBlock &MBB = *(FrameSetup->getParent());
  TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);

  const DebugLoc &DL = FrameSetup->getDebugLoc();
  bool Is64Bit = STI->is64Bit();
  // Now, iterate through the vector in reverse order, and replace the store to
  // stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
  // replace uses.
  for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
    MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
    const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
    MachineBasicBlock::iterator Push = nullptr;
    unsigned PushOpcode;
    switch (Store->getOpcode()) {
    default:
      llvm_unreachable("Unexpected Opcode!");
    case X86::AND16mi:
    case X86::AND32mi:
    case X86::AND64mi32:
    case X86::OR16mi:
    case X86::OR32mi:
    case X86::OR64mi32:
    case X86::MOV32mi:
    case X86::MOV64mi32:
      PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSH32i;
      Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
      Push->cloneMemRefs(MF, *Store);
      break;
    case X86::MOV32mr:
    case X86::MOV64mr: {
      Register Reg = PushOp.getReg();

      // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
      // in preparation for the PUSH64. The upper 32 bits can be undef.
      if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
        Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
        Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
        BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
        BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
            .addReg(UndefReg)
            .add(PushOp)
            .addImm(X86::sub_32bit);
      }

      // If PUSHrmm is not slow on this target, try to fold the source of the
      // push into the instruction.
      bool SlowPUSHrmm = STI->slowTwoMemOps();

      // Check that this is legal to fold. Right now, we're extremely
      // conservative about that.
      MachineInstr *DefMov = nullptr;
      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
        PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));

        unsigned NumOps = DefMov->getDesc().getNumOperands();
        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
          Push->addOperand(DefMov->getOperand(i));
        Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
        DefMov->eraseFromParent();
      } else {
        PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
                   .addReg(Reg)
                   .getInstr();
        Push->cloneMemRefs(MF, *Store);
      }
      break;
    }
    }

    // For debugging, when using SP-based CFA, we need to adjust the CFA
    // offset after each push.
    // TODO: This is needed only if we require precise CFA.
    if (!TFL->hasFP(MF))
      TFL->BuildCFI(
          MBB, std::next(Push), DL,
          MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));

    MBB.erase(Store);
  }

  // The stack-pointer copy is no longer used in the call sequences.
  // There should not be any other users, but we can't commit to that, so:
  if (Context.SPCopy && MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
    Context.SPCopy->eraseFromParent();

  // Once we've done this, we need to make sure PEI doesn't assume a reserved
  // frame.
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  FuncInfo->setHasPushSequences(true);
}

MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
    MachineBasicBlock::iterator FrameSetup, Register Reg) {
  // Do an extremely restricted form of load folding.
  // ISel will often create patterns like:
  // movl 4(%edi), %eax
  // movl 8(%edi), %ecx
  // movl 12(%edi), %edx
  // movl %edx, 8(%esp)
  // movl %ecx, 4(%esp)
  // movl %eax, (%esp)
  // call
  // Get rid of those with prejudice.
  if (!Reg.isVirtual())
    return nullptr;

  // Make sure this is the only use of Reg.
  if (!MRI->hasOneNonDBGUse(Reg))
    return nullptr;

  MachineInstr &DefMI = *MRI->getVRegDef(Reg);

  // Make sure the def is a MOV from memory.
  // If the def is in another block, give up.
  if ((DefMI.getOpcode() != X86::MOV32rm &&
       DefMI.getOpcode() != X86::MOV64rm) ||
      DefMI.getParent() != FrameSetup->getParent())
    return nullptr;

  // Make sure we don't have any instructions between DefMI and the
  // push that make folding the load illegal.
  for (MachineBasicBlock::iterator I = DefMI; I != FrameSetup; ++I)
    if (I->isLoadFoldBarrier())
      return nullptr;

  return &DefMI;
}

FunctionPass *llvm::createX86CallFrameOptimization() {
  return new X86CallFrameOptimization();
}