Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
35269 views
//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file contains the X86 implementation of TargetFrameLowering class.9//10//===----------------------------------------------------------------------===//1112#include "X86FrameLowering.h"13#include "MCTargetDesc/X86MCTargetDesc.h"14#include "X86InstrBuilder.h"15#include "X86InstrInfo.h"16#include "X86MachineFunctionInfo.h"17#include "X86Subtarget.h"18#include "X86TargetMachine.h"19#include "llvm/ADT/Statistic.h"20#include "llvm/CodeGen/LivePhysRegs.h"21#include "llvm/CodeGen/MachineFrameInfo.h"22#include "llvm/CodeGen/MachineFunction.h"23#include "llvm/CodeGen/MachineInstrBuilder.h"24#include "llvm/CodeGen/MachineModuleInfo.h"25#include "llvm/CodeGen/MachineRegisterInfo.h"26#include "llvm/CodeGen/WinEHFuncInfo.h"27#include "llvm/IR/DataLayout.h"28#include "llvm/IR/EHPersonalities.h"29#include "llvm/IR/Function.h"30#include "llvm/IR/Module.h"31#include "llvm/MC/MCAsmInfo.h"32#include "llvm/MC/MCObjectFileInfo.h"33#include "llvm/MC/MCSymbol.h"34#include "llvm/Support/Debug.h"35#include "llvm/Support/LEB128.h"36#include "llvm/Target/TargetOptions.h"37#include <cstdlib>3839#define DEBUG_TYPE "x86-fl"4041STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");42STATISTIC(NumFrameExtraProbe,43"Number of extra stack probes generated in prologue");44STATISTIC(NumFunctionUsingPush2Pop2, "Number of funtions using push2/pop2");4546using namespace llvm;4748X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,49MaybeAlign StackAlignOverride)50: TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),51STI.is64Bit() ? -8 : -4),52STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {53// Cache a bunch of frame-related predicates for this subtarget.54SlotSize = TRI->getSlotSize();55Is64Bit = STI.is64Bit();56IsLP64 = STI.isTarget64BitLP64();57// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.58Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();59StackPtr = TRI->getStackRegister();60}6162bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {63return !MF.getFrameInfo().hasVarSizedObjects() &&64!MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&65!MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();66}6768/// canSimplifyCallFramePseudos - If there is a reserved call frame, the69/// call frame pseudos can be simplified. Having a FP, as in the default70/// implementation, is not sufficient here since we can't always use it.71/// Use a more nuanced condition.72bool X86FrameLowering::canSimplifyCallFramePseudos(73const MachineFunction &MF) const {74return hasReservedCallFrame(MF) ||75MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||76(hasFP(MF) && !TRI->hasStackRealignment(MF)) ||77TRI->hasBasePointer(MF);78}7980// needsFrameIndexResolution - Do we need to perform FI resolution for81// this function. Normally, this is required only when the function82// has any stack objects. 
However, FI resolution actually has another job,83// not apparent from the title - it resolves callframesetup/destroy84// that were not simplified earlier.85// So, this is required for x86 functions that have push sequences even86// when there are no stack objects.87bool X86FrameLowering::needsFrameIndexResolution(88const MachineFunction &MF) const {89return MF.getFrameInfo().hasStackObjects() ||90MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();91}9293/// hasFP - Return true if the specified function should have a dedicated frame94/// pointer register. This is true if the function has variable sized allocas95/// or if frame pointer elimination is disabled.96bool X86FrameLowering::hasFP(const MachineFunction &MF) const {97const MachineFrameInfo &MFI = MF.getFrameInfo();98return (MF.getTarget().Options.DisableFramePointerElim(MF) ||99TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||100MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||101MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||102MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||103MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||104MFI.hasStackMap() || MFI.hasPatchPoint() ||105(isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));106}107108static unsigned getSUBriOpcode(bool IsLP64) {109return IsLP64 ? X86::SUB64ri32 : X86::SUB32ri;110}111112static unsigned getADDriOpcode(bool IsLP64) {113return IsLP64 ? X86::ADD64ri32 : X86::ADD32ri;114}115116static unsigned getSUBrrOpcode(bool IsLP64) {117return IsLP64 ? X86::SUB64rr : X86::SUB32rr;118}119120static unsigned getADDrrOpcode(bool IsLP64) {121return IsLP64 ? X86::ADD64rr : X86::ADD32rr;122}123124static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {125return IsLP64 ? X86::AND64ri32 : X86::AND32ri;126}127128static unsigned getLEArOpcode(bool IsLP64) {129return IsLP64 ? X86::LEA64r : X86::LEA32r;130}131132static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {133if (Use64BitReg) {134if (isUInt<32>(Imm))135return X86::MOV32ri64;136if (isInt<32>(Imm))137return X86::MOV64ri32;138return X86::MOV64ri;139}140return X86::MOV32ri;141}142143// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the144// value written by the PUSH from the stack. The processor tracks these marked145// instructions internally and fast-forwards register data between matching PUSH146// and POP instructions, without going through memory or through the training147// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient148// memory-renaming optimization can be used.149//150// The PPX hint is purely a performance hint. Instructions with this hint have151// the same functional semantics as those without. PPX hints set by the152// compiler that violate the balancing rule may turn off the PPX optimization,153// but they will not affect program semantics.154//155// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp156// are not considered).157//158// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2159// GPRs at a time to/from the stack.160static unsigned getPUSHOpcode(const X86Subtarget &ST) {161return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)162: X86::PUSH32r;163}164static unsigned getPOPOpcode(const X86Subtarget &ST) {165return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)166: X86::POP32r;167}168static unsigned getPUSH2Opcode(const X86Subtarget &ST) {169return ST.hasPPX() ? 
X86::PUSH2P : X86::PUSH2;170}171static unsigned getPOP2Opcode(const X86Subtarget &ST) {172return ST.hasPPX() ? X86::POP2P : X86::POP2;173}174175static bool isEAXLiveIn(MachineBasicBlock &MBB) {176for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {177unsigned Reg = RegMask.PhysReg;178179if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||180Reg == X86::AH || Reg == X86::AL)181return true;182}183184return false;185}186187/// Check if the flags need to be preserved before the terminators.188/// This would be the case, if the eflags is live-in of the region189/// composed by the terminators or live-out of that region, without190/// being defined by a terminator.191static bool192flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {193for (const MachineInstr &MI : MBB.terminators()) {194bool BreakNext = false;195for (const MachineOperand &MO : MI.operands()) {196if (!MO.isReg())197continue;198Register Reg = MO.getReg();199if (Reg != X86::EFLAGS)200continue;201202// This terminator needs an eflags that is not defined203// by a previous another terminator:204// EFLAGS is live-in of the region composed by the terminators.205if (!MO.isDef())206return true;207// This terminator defines the eflags, i.e., we don't need to preserve it.208// However, we still need to check this specific terminator does not209// read a live-in value.210BreakNext = true;211}212// We found a definition of the eflags, no need to preserve them.213if (BreakNext)214return false;215}216217// None of the terminators use or define the eflags.218// Check if they are live-out, that would imply we need to preserve them.219for (const MachineBasicBlock *Succ : MBB.successors())220if (Succ->isLiveIn(X86::EFLAGS))221return true;222223return false;224}225226/// emitSPUpdate - Emit a series of instructions to increment / decrement the227/// stack pointer by a constant value.228void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,229MachineBasicBlock::iterator &MBBI,230const DebugLoc &DL, int64_t NumBytes,231bool InEpilogue) const {232bool isSub = NumBytes < 0;233uint64_t Offset = isSub ? -NumBytes : NumBytes;234MachineInstr::MIFlag Flag =235isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;236237uint64_t Chunk = (1LL << 31) - 1;238239MachineFunction &MF = *MBB.getParent();240const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();241const X86TargetLowering &TLI = *STI.getTargetLowering();242const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);243244// It's ok to not take into account large chunks when probing, as the245// allocation is split in smaller chunks anyway.246if (EmitInlineStackProbe && !InEpilogue) {247248// This pseudo-instruction is going to be expanded, potentially using a249// loop, by inlineStackProbe().250BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);251return;252} else if (Offset > Chunk) {253// Rather than emit a long series of instructions for large offsets,254// load the offset into a register and do one sub/add255unsigned Reg = 0;256unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);257258if (isSub && !isEAXLiveIn(MBB))259Reg = Rax;260else261Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);262263unsigned AddSubRROpc =264isSub ? 
getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);265if (Reg) {266BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)267.addImm(Offset)268.setMIFlag(Flag);269MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)270.addReg(StackPtr)271.addReg(Reg);272MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.273return;274} else if (Offset > 8 * Chunk) {275// If we would need more than 8 add or sub instructions (a >16GB stack276// frame), it's worth spilling RAX to materialize this immediate.277// pushq %rax278// movabsq +-$Offset+-SlotSize, %rax279// addq %rsp, %rax280// xchg %rax, (%rsp)281// movq (%rsp), %rsp282assert(Is64Bit && "can't have 32-bit 16GB stack frame");283BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))284.addReg(Rax, RegState::Kill)285.setMIFlag(Flag);286// Subtract is not commutative, so negate the offset and always use add.287// Subtract 8 less and add 8 more to account for the PUSH we just did.288if (isSub)289Offset = -(Offset - SlotSize);290else291Offset = Offset + SlotSize;292BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)293.addImm(Offset)294.setMIFlag(Flag);295MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)296.addReg(Rax)297.addReg(StackPtr);298MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.299// Exchange the new SP in RAX with the top of the stack.300addRegOffset(301BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),302StackPtr, false, 0);303// Load new SP from the top of the stack into RSP.304addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),305StackPtr, false, 0);306return;307}308}309310while (Offset) {311uint64_t ThisVal = std::min(Offset, Chunk);312if (ThisVal == SlotSize) {313// Use push / pop for slot sized adjustments as a size optimization. We314// need to find a dead register when using pop.315unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)316: TRI->findDeadCallerSavedReg(MBB, MBBI);317if (Reg) {318unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)319: (Is64Bit ? X86::POP64r : X86::POP32r);320BuildMI(MBB, MBBI, DL, TII.get(Opc))321.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))322.setMIFlag(Flag);323Offset -= ThisVal;324continue;325}326}327328BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)329.setMIFlag(Flag);330331Offset -= ThisVal;332}333}334335MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(336MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,337const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {338assert(Offset != 0 && "zero offset stack adjustment requested");339340// On Atom, using LEA to adjust SP is preferred, but using it in the epilogue341// is tricky.342bool UseLEA;343if (!InEpilogue) {344// Check if inserting the prologue at the beginning345// of MBB would require to use LEA operations.346// We need to use LEA operations if EFLAGS is live in, because347// it means an instruction will read it before it gets defined.348UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);349} else {350// If we can use LEA for SP but we shouldn't, check that none351// of the terminators uses the eflags. 
Otherwise we will insert352// a ADD that will redefine the eflags and break the condition.353// Alternatively, we could move the ADD, but this may not be possible354// and is an optimization anyway.355UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());356if (UseLEA && !STI.useLeaForSP())357UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);358// If that assert breaks, that means we do not do the right thing359// in canUseAsEpilogue.360assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&361"We shouldn't have allowed this insertion point");362}363364MachineInstrBuilder MI;365if (UseLEA) {366MI = addRegOffset(BuildMI(MBB, MBBI, DL,367TII.get(getLEArOpcode(Uses64BitFramePtr)),368StackPtr),369StackPtr, false, Offset);370} else {371bool IsSub = Offset < 0;372uint64_t AbsOffset = IsSub ? -Offset : Offset;373const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr)374: getADDriOpcode(Uses64BitFramePtr);375MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)376.addReg(StackPtr)377.addImm(AbsOffset);378MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.379}380return MI;381}382383int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,384MachineBasicBlock::iterator &MBBI,385bool doMergeWithPrevious) const {386if ((doMergeWithPrevious && MBBI == MBB.begin()) ||387(!doMergeWithPrevious && MBBI == MBB.end()))388return 0;389390MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;391392PI = skipDebugInstructionsBackward(PI, MBB.begin());393// It is assumed that ADD/SUB/LEA instruction is succeded by one CFI394// instruction, and that there are no DBG_VALUE or other instructions between395// ADD/SUB/LEA and its corresponding CFI instruction.396/* TODO: Add support for the case where there are multiple CFI instructions397below the ADD/SUB/LEA, e.g.:398...399add400cfi_def_cfa_offset401cfi_offset402...403*/404if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())405PI = std::prev(PI);406407unsigned Opc = PI->getOpcode();408int Offset = 0;409410if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&411PI->getOperand(0).getReg() == StackPtr) {412assert(PI->getOperand(1).getReg() == StackPtr);413Offset = PI->getOperand(2).getImm();414} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&415PI->getOperand(0).getReg() == StackPtr &&416PI->getOperand(1).getReg() == StackPtr &&417PI->getOperand(2).getImm() == 1 &&418PI->getOperand(3).getReg() == X86::NoRegister &&419PI->getOperand(5).getReg() == X86::NoRegister) {420// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.421Offset = PI->getOperand(4).getImm();422} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&423PI->getOperand(0).getReg() == StackPtr) {424assert(PI->getOperand(1).getReg() == StackPtr);425Offset = -PI->getOperand(2).getImm();426} else427return 0;428429PI = MBB.erase(PI);430if (PI != MBB.end() && PI->isCFIInstruction()) {431auto CIs = MBB.getParent()->getFrameInstructions();432MCCFIInstruction CI = CIs[PI->getOperand(0).getCFIIndex()];433if (CI.getOperation() == MCCFIInstruction::OpDefCfaOffset ||434CI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)435PI = MBB.erase(PI);436}437if (!doMergeWithPrevious)438MBBI = skipDebugInstructionsForward(PI, MBB.end());439440return Offset;441}442443void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,444MachineBasicBlock::iterator MBBI,445const DebugLoc &DL,446const MCCFIInstruction &CFIInst,447MachineInstr::MIFlag Flag) const {448MachineFunction &MF = *MBB.getParent();449unsigned CFIIndex = 
MF.addFrameInst(CFIInst);450451if (CFIInst.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)452MF.getInfo<X86MachineFunctionInfo>()->setHasCFIAdjustCfa(true);453454BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))455.addCFIIndex(CFIIndex)456.setMIFlag(Flag);457}458459/// Emits Dwarf Info specifying offsets of callee saved registers and460/// frame pointer. This is called only when basic block sections are enabled.461void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(462MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {463MachineFunction &MF = *MBB.getParent();464if (!hasFP(MF)) {465emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);466return;467}468const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();469const Register FramePtr = TRI->getFrameRegister(MF);470const Register MachineFramePtr =471STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))472: FramePtr;473unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);474// Offset = space for return address + size of the frame pointer itself.475int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);476BuildCFI(MBB, MBBI, DebugLoc{},477MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));478emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);479}480481void X86FrameLowering::emitCalleeSavedFrameMoves(482MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,483const DebugLoc &DL, bool IsPrologue) const {484MachineFunction &MF = *MBB.getParent();485MachineFrameInfo &MFI = MF.getFrameInfo();486const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();487X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();488489// Add callee saved registers to move list.490const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();491492// Calculate offsets.493for (const CalleeSavedInfo &I : CSI) {494int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());495Register Reg = I.getReg();496unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);497498if (IsPrologue) {499if (X86FI->getStackPtrSaveMI()) {500// +2*SlotSize because there is return address and ebp at the bottom501// of the stack.502// | retaddr |503// | ebp |504// | |<--ebp505Offset += 2 * SlotSize;506SmallString<64> CfaExpr;507CfaExpr.push_back(dwarf::DW_CFA_expression);508uint8_t buffer[16];509CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));510CfaExpr.push_back(2);511Register FramePtr = TRI->getFrameRegister(MF);512const Register MachineFramePtr =513STI.isTarget64BitILP32()514? Register(getX86SubSuperRegister(FramePtr, 64))515: FramePtr;516unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);517CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));518CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));519BuildCFI(MBB, MBBI, DL,520MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),521MachineInstr::FrameSetup);522} else {523BuildCFI(MBB, MBBI, DL,524MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));525}526} else {527BuildCFI(MBB, MBBI, DL,528MCCFIInstruction::createRestore(nullptr, DwarfReg));529}530}531if (auto *MI = X86FI->getStackPtrSaveMI()) {532int FI = MI->getOperand(1).getIndex();533int64_t Offset = MFI.getObjectOffset(FI) + 2 * SlotSize;534SmallString<64> CfaExpr;535Register FramePtr = TRI->getFrameRegister(MF);536const Register MachineFramePtr =537STI.isTarget64BitILP32()538? 
Register(getX86SubSuperRegister(FramePtr, 64))539: FramePtr;540unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);541CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));542uint8_t buffer[16];543CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));544CfaExpr.push_back(dwarf::DW_OP_deref);545546SmallString<64> DefCfaExpr;547DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);548DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));549DefCfaExpr.append(CfaExpr.str());550// DW_CFA_def_cfa_expression: DW_OP_breg5 offset, DW_OP_deref551BuildCFI(MBB, MBBI, DL,552MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),553MachineInstr::FrameSetup);554}555}556557void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,558MachineBasicBlock &MBB) const {559const MachineFunction &MF = *MBB.getParent();560561// Insertion point.562MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();563564// Fake a debug loc.565DebugLoc DL;566if (MBBI != MBB.end())567DL = MBBI->getDebugLoc();568569// Zero out FP stack if referenced. Do this outside of the loop below so that570// it's done only once.571const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();572for (MCRegister Reg : RegsToZero.set_bits()) {573if (!X86::RFP80RegClass.contains(Reg))574continue;575576unsigned NumFPRegs = ST.is64Bit() ? 8 : 7;577for (unsigned i = 0; i != NumFPRegs; ++i)578BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0));579580for (unsigned i = 0; i != NumFPRegs; ++i)581BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0);582break;583}584585// For GPRs, we only care to clear out the 32-bit register.586BitVector GPRsToZero(TRI->getNumRegs());587for (MCRegister Reg : RegsToZero.set_bits())588if (TRI->isGeneralPurposeRegister(MF, Reg)) {589GPRsToZero.set(getX86SubSuperRegister(Reg, 32));590RegsToZero.reset(Reg);591}592593// Zero out the GPRs first.594for (MCRegister Reg : GPRsToZero.set_bits())595TII.buildClearRegister(Reg, MBB, MBBI, DL);596597// Zero out the remaining registers.598for (MCRegister Reg : RegsToZero.set_bits())599TII.buildClearRegister(Reg, MBB, MBBI, DL);600}601602void X86FrameLowering::emitStackProbe(603MachineFunction &MF, MachineBasicBlock &MBB,604MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,605std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {606const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();607if (STI.isTargetWindowsCoreCLR()) {608if (InProlog) {609BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))610.addImm(0 /* no explicit stack size */);611} else {612emitStackProbeInline(MF, MBB, MBBI, DL, false);613}614} else {615emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum);616}617}618619bool X86FrameLowering::stackProbeFunctionModifiesSP() const {620return STI.isOSWindows() && !STI.isTargetWin64();621}622623void X86FrameLowering::inlineStackProbe(MachineFunction &MF,624MachineBasicBlock &PrologMBB) const {625auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {626return MI.getOpcode() == X86::STACKALLOC_W_PROBING;627});628if (Where != PrologMBB.end()) {629DebugLoc DL = PrologMBB.findDebugLoc(Where);630emitStackProbeInline(MF, PrologMBB, Where, DL, true);631Where->eraseFromParent();632}633}634635void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,636MachineBasicBlock &MBB,637MachineBasicBlock::iterator MBBI,638const DebugLoc &DL,639bool InProlog) const {640const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();641if (STI.isTargetWindowsCoreCLR() && 
STI.is64Bit())642emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);643else644emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);645}646647void X86FrameLowering::emitStackProbeInlineGeneric(648MachineFunction &MF, MachineBasicBlock &MBB,649MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {650MachineInstr &AllocWithProbe = *MBBI;651uint64_t Offset = AllocWithProbe.getOperand(0).getImm();652653const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();654const X86TargetLowering &TLI = *STI.getTargetLowering();655assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&656"different expansion expected for CoreCLR 64 bit");657658const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);659uint64_t ProbeChunk = StackProbeSize * 8;660661uint64_t MaxAlign =662TRI->hasStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;663664// Synthesize a loop or unroll it, depending on the number of iterations.665// BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left666// between the unaligned rsp and current rsp.667if (Offset > ProbeChunk) {668emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,669MaxAlign % StackProbeSize);670} else {671emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,672MaxAlign % StackProbeSize);673}674}675676void X86FrameLowering::emitStackProbeInlineGenericBlock(677MachineFunction &MF, MachineBasicBlock &MBB,678MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,679uint64_t AlignOffset) const {680681const bool NeedsDwarfCFI = needsDwarfCFI(MF);682const bool HasFP = hasFP(MF);683const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();684const X86TargetLowering &TLI = *STI.getTargetLowering();685const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;686const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);687688uint64_t CurrentOffset = 0;689690assert(AlignOffset < StackProbeSize);691692// If the offset is so small it fits within a page, there's nothing to do.693if (StackProbeSize < Offset + AlignOffset) {694695uint64_t StackAdjustment = StackProbeSize - AlignOffset;696BuildStackAdjustment(MBB, MBBI, DL, -StackAdjustment, /*InEpilogue=*/false)697.setMIFlag(MachineInstr::FrameSetup);698if (!HasFP && NeedsDwarfCFI) {699BuildCFI(700MBB, MBBI, DL,701MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));702}703704addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))705.setMIFlag(MachineInstr::FrameSetup),706StackPtr, false, 0)707.addImm(0)708.setMIFlag(MachineInstr::FrameSetup);709NumFrameExtraProbe++;710CurrentOffset = StackProbeSize - AlignOffset;711}712713// For the next N - 1 pages, just probe. 
I tried to take advantage of714// natural probes but it implies much more logic and there was very few715// interesting natural probes to interleave.716while (CurrentOffset + StackProbeSize < Offset) {717BuildStackAdjustment(MBB, MBBI, DL, -StackProbeSize, /*InEpilogue=*/false)718.setMIFlag(MachineInstr::FrameSetup);719720if (!HasFP && NeedsDwarfCFI) {721BuildCFI(722MBB, MBBI, DL,723MCCFIInstruction::createAdjustCfaOffset(nullptr, StackProbeSize));724}725addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))726.setMIFlag(MachineInstr::FrameSetup),727StackPtr, false, 0)728.addImm(0)729.setMIFlag(MachineInstr::FrameSetup);730NumFrameExtraProbe++;731CurrentOffset += StackProbeSize;732}733734// No need to probe the tail, it is smaller than a Page.735uint64_t ChunkSize = Offset - CurrentOffset;736if (ChunkSize == SlotSize) {737// Use push for slot sized adjustments as a size optimization,738// like emitSPUpdate does when not probing.739unsigned Reg = Is64Bit ? X86::RAX : X86::EAX;740unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;741BuildMI(MBB, MBBI, DL, TII.get(Opc))742.addReg(Reg, RegState::Undef)743.setMIFlag(MachineInstr::FrameSetup);744} else {745BuildStackAdjustment(MBB, MBBI, DL, -ChunkSize, /*InEpilogue=*/false)746.setMIFlag(MachineInstr::FrameSetup);747}748// No need to adjust Dwarf CFA offset here, the last position of the stack has749// been defined750}751752void X86FrameLowering::emitStackProbeInlineGenericLoop(753MachineFunction &MF, MachineBasicBlock &MBB,754MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,755uint64_t AlignOffset) const {756assert(Offset && "null offset");757758assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=759MachineBasicBlock::LQR_Live &&760"Inline stack probe loop will clobber live EFLAGS.");761762const bool NeedsDwarfCFI = needsDwarfCFI(MF);763const bool HasFP = hasFP(MF);764const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();765const X86TargetLowering &TLI = *STI.getTargetLowering();766const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;767const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);768769if (AlignOffset) {770if (AlignOffset < StackProbeSize) {771// Perform a first smaller allocation followed by a probe.772BuildStackAdjustment(MBB, MBBI, DL, -AlignOffset, /*InEpilogue=*/false)773.setMIFlag(MachineInstr::FrameSetup);774775addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))776.setMIFlag(MachineInstr::FrameSetup),777StackPtr, false, 0)778.addImm(0)779.setMIFlag(MachineInstr::FrameSetup);780NumFrameExtraProbe++;781Offset -= AlignOffset;782}783}784785// Synthesize a loop786NumFrameLoopProbe++;787const BasicBlock *LLVM_BB = MBB.getBasicBlock();788789MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);790MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);791792MachineFunction::iterator MBBIter = ++MBB.getIterator();793MF.insert(MBBIter, testMBB);794MF.insert(MBBIter, tailMBB);795796Register FinalStackProbed = Uses64BitFramePtr ? X86::R11797: Is64Bit ? 
X86::R11D798: X86::EAX;799800BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)801.addReg(StackPtr)802.setMIFlag(MachineInstr::FrameSetup);803804// save loop bound805{806const unsigned BoundOffset = alignDown(Offset, StackProbeSize);807const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);808BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)809.addReg(FinalStackProbed)810.addImm(BoundOffset)811.setMIFlag(MachineInstr::FrameSetup);812813// while in the loop, use loop-invariant reg for CFI,814// instead of the stack pointer, which changes during the loop815if (!HasFP && NeedsDwarfCFI) {816// x32 uses the same DWARF register numbers as x86-64,817// so there isn't a register number for r11d, we must use r11 instead818const Register DwarfFinalStackProbed =819STI.isTarget64BitILP32()820? Register(getX86SubSuperRegister(FinalStackProbed, 64))821: FinalStackProbed;822823BuildCFI(MBB, MBBI, DL,824MCCFIInstruction::createDefCfaRegister(825nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true)));826BuildCFI(MBB, MBBI, DL,827MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset));828}829}830831// allocate a page832BuildStackAdjustment(*testMBB, testMBB->end(), DL, -StackProbeSize,833/*InEpilogue=*/false)834.setMIFlag(MachineInstr::FrameSetup);835836// touch the page837addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))838.setMIFlag(MachineInstr::FrameSetup),839StackPtr, false, 0)840.addImm(0)841.setMIFlag(MachineInstr::FrameSetup);842843// cmp with stack pointer bound844BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))845.addReg(StackPtr)846.addReg(FinalStackProbed)847.setMIFlag(MachineInstr::FrameSetup);848849// jump850BuildMI(testMBB, DL, TII.get(X86::JCC_1))851.addMBB(testMBB)852.addImm(X86::COND_NE)853.setMIFlag(MachineInstr::FrameSetup);854testMBB->addSuccessor(testMBB);855testMBB->addSuccessor(tailMBB);856857// BB management858tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());859tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);860MBB.addSuccessor(testMBB);861862// handle tail863const uint64_t TailOffset = Offset % StackProbeSize;864MachineBasicBlock::iterator TailMBBIter = tailMBB->begin();865if (TailOffset) {866BuildStackAdjustment(*tailMBB, TailMBBIter, DL, -TailOffset,867/*InEpilogue=*/false)868.setMIFlag(MachineInstr::FrameSetup);869}870871// after the loop, switch back to stack pointer for CFI872if (!HasFP && NeedsDwarfCFI) {873// x32 uses the same DWARF register numbers as x86-64,874// so there isn't a register number for esp, we must use rsp instead875const Register DwarfStackPtr =876STI.isTarget64BitILP32()877? 
Register(getX86SubSuperRegister(StackPtr, 64))878: Register(StackPtr);879880BuildCFI(*tailMBB, TailMBBIter, DL,881MCCFIInstruction::createDefCfaRegister(882nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true)));883}884885// Update Live In information886fullyRecomputeLiveIns({tailMBB, testMBB});887}888889void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(890MachineFunction &MF, MachineBasicBlock &MBB,891MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {892const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();893assert(STI.is64Bit() && "different expansion needed for 32 bit");894assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");895const TargetInstrInfo &TII = *STI.getInstrInfo();896const BasicBlock *LLVM_BB = MBB.getBasicBlock();897898assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=899MachineBasicBlock::LQR_Live &&900"Inline stack probe loop will clobber live EFLAGS.");901902// RAX contains the number of bytes of desired stack adjustment.903// The handling here assumes this value has already been updated so as to904// maintain stack alignment.905//906// We need to exit with RSP modified by this amount and execute suitable907// page touches to notify the OS that we're growing the stack responsibly.908// All stack probing must be done without modifying RSP.909//910// MBB:911// SizeReg = RAX;912// ZeroReg = 0913// CopyReg = RSP914// Flags, TestReg = CopyReg - SizeReg915// FinalReg = !Flags.Ovf ? TestReg : ZeroReg916// LimitReg = gs magic thread env access917// if FinalReg >= LimitReg goto ContinueMBB918// RoundBB:919// RoundReg = page address of FinalReg920// LoopMBB:921// LoopReg = PHI(LimitReg,ProbeReg)922// ProbeReg = LoopReg - PageSize923// [ProbeReg] = 0924// if (ProbeReg > RoundReg) goto LoopMBB925// ContinueMBB:926// RSP = RSP - RAX927// [rest of original MBB]928929// Set up the new basic blocks930MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);931MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);932MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);933934MachineFunction::iterator MBBIter = std::next(MBB.getIterator());935MF.insert(MBBIter, RoundMBB);936MF.insert(MBBIter, LoopMBB);937MF.insert(MBBIter, ContinueMBB);938939// Split MBB and move the tail portion down to ContinueMBB.940MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);941ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());942ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);943944// Some useful constants945const int64_t ThreadEnvironmentStackLimit = 0x10;946const int64_t PageSize = 0x1000;947const int64_t PageMask = ~(PageSize - 1);948949// Registers we need. For the normal case we use virtual950// registers. For the prolog expansion we use RAX, RCX and RDX.951MachineRegisterInfo &MRI = MF.getRegInfo();952const TargetRegisterClass *RegClass = &X86::GR64RegClass;953const Register954SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),955ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),956CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),957TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),958FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),959RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),960LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),961JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),962ProbeReg = InProlog ? 
X86::RCX : MRI.createVirtualRegister(RegClass);963964// SP-relative offsets where we can save RCX and RDX.965int64_t RCXShadowSlot = 0;966int64_t RDXShadowSlot = 0;967968// If inlining in the prolog, save RCX and RDX.969if (InProlog) {970// Compute the offsets. We need to account for things already971// pushed onto the stack at this point: return address, frame972// pointer (if used), and callee saves.973X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();974const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();975const bool HasFP = hasFP(MF);976977// Check if we need to spill RCX and/or RDX.978// Here we assume that no earlier prologue instruction changes RCX and/or979// RDX, so checking the block live-ins is enough.980const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);981const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);982int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);983// Assign the initial slot to both registers, then change RDX's slot if both984// need to be spilled.985if (IsRCXLiveIn)986RCXShadowSlot = InitSlot;987if (IsRDXLiveIn)988RDXShadowSlot = InitSlot;989if (IsRDXLiveIn && IsRCXLiveIn)990RDXShadowSlot += 8;991// Emit the saves if needed.992if (IsRCXLiveIn)993addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,994RCXShadowSlot)995.addReg(X86::RCX);996if (IsRDXLiveIn)997addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,998RDXShadowSlot)999.addReg(X86::RDX);1000} else {1001// Not in the prolog. Copy RAX to a virtual reg.1002BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);1003}10041005// Add code to MBB to check for overflow and set the new target stack pointer1006// to zero if so.1007BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)1008.addReg(ZeroReg, RegState::Undef)1009.addReg(ZeroReg, RegState::Undef);1010BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);1011BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)1012.addReg(CopyReg)1013.addReg(SizeReg);1014BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)1015.addReg(TestReg)1016.addReg(ZeroReg)1017.addImm(X86::COND_B);10181019// FinalReg now holds final stack pointer value, or zero if1020// allocation would overflow. Compare against the current stack1021// limit from the thread environment block. Note this limit is the1022// lowest touched page on the stack, not the point at which the OS1023// will cause an overflow exception, so this is just an optimization1024// to avoid unnecessarily touching pages that are below the current1025// SP but already committed to the stack by the OS.1026BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)1027.addReg(0)1028.addImm(1)1029.addReg(0)1030.addImm(ThreadEnvironmentStackLimit)1031.addReg(X86::GS);1032BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);1033// Jump if the desired stack pointer is at or above the stack limit.1034BuildMI(&MBB, DL, TII.get(X86::JCC_1))1035.addMBB(ContinueMBB)1036.addImm(X86::COND_AE);10371038// Add code to roundMBB to round the final stack pointer to a page boundary.1039RoundMBB->addLiveIn(FinalReg);1040BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)1041.addReg(FinalReg)1042.addImm(PageMask);1043BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);10441045// LimitReg now holds the current stack limit, RoundedReg page-rounded1046// final RSP value. 
Add code to loopMBB to decrement LimitReg page-by-page1047// and probe until we reach RoundedReg.1048if (!InProlog) {1049BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)1050.addReg(LimitReg)1051.addMBB(RoundMBB)1052.addReg(ProbeReg)1053.addMBB(LoopMBB);1054}10551056LoopMBB->addLiveIn(JoinReg);1057addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,1058false, -PageSize);10591060// Probe by storing a byte onto the stack.1061BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))1062.addReg(ProbeReg)1063.addImm(1)1064.addReg(0)1065.addImm(0)1066.addReg(0)1067.addImm(0);10681069LoopMBB->addLiveIn(RoundedReg);1070BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))1071.addReg(RoundedReg)1072.addReg(ProbeReg);1073BuildMI(LoopMBB, DL, TII.get(X86::JCC_1))1074.addMBB(LoopMBB)1075.addImm(X86::COND_NE);10761077MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();10781079// If in prolog, restore RDX and RCX.1080if (InProlog) {1081if (RCXShadowSlot) // It means we spilled RCX in the prologue.1082addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,1083TII.get(X86::MOV64rm), X86::RCX),1084X86::RSP, false, RCXShadowSlot);1085if (RDXShadowSlot) // It means we spilled RDX in the prologue.1086addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,1087TII.get(X86::MOV64rm), X86::RDX),1088X86::RSP, false, RDXShadowSlot);1089}10901091// Now that the probing is done, add code to continueMBB to update1092// the stack pointer for real.1093ContinueMBB->addLiveIn(SizeReg);1094BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)1095.addReg(X86::RSP)1096.addReg(SizeReg);10971098// Add the control flow edges we need.1099MBB.addSuccessor(ContinueMBB);1100MBB.addSuccessor(RoundMBB);1101RoundMBB->addSuccessor(LoopMBB);1102LoopMBB->addSuccessor(ContinueMBB);1103LoopMBB->addSuccessor(LoopMBB);11041105// Mark all the instructions added to the prolog as frame setup.1106if (InProlog) {1107for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {1108BeforeMBBI->setFlag(MachineInstr::FrameSetup);1109}1110for (MachineInstr &MI : *RoundMBB) {1111MI.setFlag(MachineInstr::FrameSetup);1112}1113for (MachineInstr &MI : *LoopMBB) {1114MI.setFlag(MachineInstr::FrameSetup);1115}1116for (MachineInstr &MI :1117llvm::make_range(ContinueMBB->begin(), ContinueMBBI)) {1118MI.setFlag(MachineInstr::FrameSetup);1119}1120}1121}11221123void X86FrameLowering::emitStackProbeCall(1124MachineFunction &MF, MachineBasicBlock &MBB,1125MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,1126std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {1127bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;11281129// FIXME: Add indirect thunk support and remove this.1130if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())1131report_fatal_error("Emitting stack probe calls on 64-bit with the large "1132"code model and indirect thunks not yet implemented.");11331134assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=1135MachineBasicBlock::LQR_Live &&1136"Stack probe calls will clobber live EFLAGS.");11371138unsigned CallOp;1139if (Is64Bit)1140CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;1141else1142CallOp = X86::CALLpcrel32;11431144StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);11451146MachineInstrBuilder CI;1147MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);11481149// All current stack probes take AX and SP as input, clobber flags, and1150// preserve all registers. 
x86_64 probes leave RSP unmodified.1151if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {1152// For the large code model, we have to call through a register. Use R11,1153// as it is scratch in all supported calling conventions.1154BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)1155.addExternalSymbol(MF.createExternalSymbolName(Symbol));1156CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);1157} else {1158CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))1159.addExternalSymbol(MF.createExternalSymbolName(Symbol));1160}11611162unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;1163unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;1164CI.addReg(AX, RegState::Implicit)1165.addReg(SP, RegState::Implicit)1166.addReg(AX, RegState::Define | RegState::Implicit)1167.addReg(SP, RegState::Define | RegState::Implicit)1168.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);11691170MachineInstr *ModInst = CI;1171if (STI.isTargetWin64() || !STI.isOSWindows()) {1172// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.1173// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp1174// themselves. They also does not clobber %rax so we can reuse it when1175// adjusting %rsp.1176// All other platforms do not specify a particular ABI for the stack probe1177// function, so we arbitrarily define it to not adjust %esp/%rsp itself.1178ModInst =1179BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)1180.addReg(SP)1181.addReg(AX);1182}11831184// DebugInfo variable locations -- if there's an instruction number for the1185// allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that1186// modifies SP.1187if (InstrNum) {1188if (STI.isTargetWin64() || !STI.isOSWindows()) {1189// Label destination operand of the subtract.1190MF.makeDebugValueSubstitution(*InstrNum,1191{ModInst->getDebugInstrNum(), 0});1192} else {1193// Label the call. The operand number is the penultimate operand, zero1194// based.1195unsigned SPDefOperand = ModInst->getNumOperands() - 2;1196MF.makeDebugValueSubstitution(1197*InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand});1198}1199}12001201if (InProlog) {1202// Apply the frame setup flag to all inserted instrs.1203for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)1204ExpansionMBBI->setFlag(MachineInstr::FrameSetup);1205}1206}12071208static unsigned calculateSetFPREG(uint64_t SPAdjust) {1209// Win64 ABI has a less restrictive limitation of 240; 128 works equally well1210// and might require smaller successive adjustments.1211const uint64_t Win64MaxSEHOffset = 128;1212uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);1213// Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.1214return SEHFrameOffset & -16;1215}12161217// If we're forcing a stack realignment we can't rely on just the frame1218// info, we need to know the ABI stack alignment as well in case we1219// have a call out. Otherwise just make sure we have some alignment - we'll1220// go with the minimum SlotSize.1221uint64_t1222X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {1223const MachineFrameInfo &MFI = MF.getFrameInfo();1224Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.1225Align StackAlign = getStackAlign();1226bool HasRealign = MF.getFunction().hasFnAttribute("stackrealign");1227if (HasRealign) {1228if (MFI.hasCalls())1229MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign;1230else if (MaxAlign < SlotSize)1231MaxAlign = Align(SlotSize);1232}12331234if (!Is64Bit && MF.getFunction().getCallingConv() == CallingConv::X86_INTR) {1235if (HasRealign)1236MaxAlign = (MaxAlign > 16) ? MaxAlign : Align(16);1237else1238MaxAlign = Align(16);1239}1240return MaxAlign.value();1241}12421243void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,1244MachineBasicBlock::iterator MBBI,1245const DebugLoc &DL, unsigned Reg,1246uint64_t MaxAlign) const {1247uint64_t Val = -MaxAlign;1248unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);12491250MachineFunction &MF = *MBB.getParent();1251const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();1252const X86TargetLowering &TLI = *STI.getTargetLowering();1253const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);1254const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);12551256// We want to make sure that (in worst case) less than StackProbeSize bytes1257// are not probed after the AND. This assumption is used in1258// emitStackProbeInlineGeneric.1259if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {1260{1261NumFrameLoopProbe++;1262MachineBasicBlock *entryMBB =1263MF.CreateMachineBasicBlock(MBB.getBasicBlock());1264MachineBasicBlock *headMBB =1265MF.CreateMachineBasicBlock(MBB.getBasicBlock());1266MachineBasicBlock *bodyMBB =1267MF.CreateMachineBasicBlock(MBB.getBasicBlock());1268MachineBasicBlock *footMBB =1269MF.CreateMachineBasicBlock(MBB.getBasicBlock());12701271MachineFunction::iterator MBBIter = MBB.getIterator();1272MF.insert(MBBIter, entryMBB);1273MF.insert(MBBIter, headMBB);1274MF.insert(MBBIter, bodyMBB);1275MF.insert(MBBIter, footMBB);1276const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;1277Register FinalStackProbed = Uses64BitFramePtr ? X86::R111278: Is64Bit ? X86::R11D1279: X86::EAX;12801281// Setup entry block1282{12831284entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);1285BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)1286.addReg(StackPtr)1287.setMIFlag(MachineInstr::FrameSetup);1288MachineInstr *MI =1289BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)1290.addReg(FinalStackProbed)1291.addImm(Val)1292.setMIFlag(MachineInstr::FrameSetup);12931294// The EFLAGS implicit def is dead.1295MI->getOperand(3).setIsDead();12961297BuildMI(entryMBB, DL,1298TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))1299.addReg(FinalStackProbed)1300.addReg(StackPtr)1301.setMIFlag(MachineInstr::FrameSetup);1302BuildMI(entryMBB, DL, TII.get(X86::JCC_1))1303.addMBB(&MBB)1304.addImm(X86::COND_E)1305.setMIFlag(MachineInstr::FrameSetup);1306entryMBB->addSuccessor(headMBB);1307entryMBB->addSuccessor(&MBB);1308}13091310// Loop entry block13111312{1313const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);1314BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)1315.addReg(StackPtr)1316.addImm(StackProbeSize)1317.setMIFlag(MachineInstr::FrameSetup);13181319BuildMI(headMBB, DL,1320TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr))1321.addReg(StackPtr)1322.addReg(FinalStackProbed)1323.setMIFlag(MachineInstr::FrameSetup);13241325// jump to the footer if StackPtr < FinalStackProbed1326BuildMI(headMBB, DL, TII.get(X86::JCC_1))1327.addMBB(footMBB)1328.addImm(X86::COND_B)1329.setMIFlag(MachineInstr::FrameSetup);13301331headMBB->addSuccessor(bodyMBB);1332headMBB->addSuccessor(footMBB);1333}13341335// setup loop body1336{1337addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))1338.setMIFlag(MachineInstr::FrameSetup),1339StackPtr, false, 0)1340.addImm(0)1341.setMIFlag(MachineInstr::FrameSetup);13421343const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);1344BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)1345.addReg(StackPtr)1346.addImm(StackProbeSize)1347.setMIFlag(MachineInstr::FrameSetup);13481349// cmp with stack pointer bound1350BuildMI(bodyMBB, DL,1351TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))1352.addReg(FinalStackProbed)1353.addReg(StackPtr)1354.setMIFlag(MachineInstr::FrameSetup);13551356// jump back while FinalStackProbed < StackPtr1357BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))1358.addMBB(bodyMBB)1359.addImm(X86::COND_B)1360.setMIFlag(MachineInstr::FrameSetup);1361bodyMBB->addSuccessor(bodyMBB);1362bodyMBB->addSuccessor(footMBB);1363}13641365// setup loop footer1366{1367BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)1368.addReg(FinalStackProbed)1369.setMIFlag(MachineInstr::FrameSetup);1370addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))1371.setMIFlag(MachineInstr::FrameSetup),1372StackPtr, false, 0)1373.addImm(0)1374.setMIFlag(MachineInstr::FrameSetup);1375footMBB->addSuccessor(&MBB);1376}13771378fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB});1379}1380} else {1381MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)1382.addReg(Reg)1383.addImm(Val)1384.setMIFlag(MachineInstr::FrameSetup);13851386// The EFLAGS implicit def is dead.1387MI->getOperand(3).setIsDead();1388}1389}13901391bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {1392// x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be1393// clobbered by any interrupt handler.1394assert(&STI == &MF.getSubtarget<X86Subtarget>() &&1395"MF used frame lowering for wrong subtarget");1396const Function &Fn = MF.getFunction();1397const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());1398return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);1399}14001401/// Return true if we need to use the restricted Windows x64 prologue and1402/// epilogue code patterns that can be described with WinCFI (.seh_*1403/// directives).1404bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {1405return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();1406}14071408bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {1409return !isWin64Prologue(MF) && MF.needsFrameMoves();1410}14111412/// Return true if an opcode is part of the REP group of instructions1413static bool isOpcodeRep(unsigned Opcode) {1414switch (Opcode) {1415case X86::REPNE_PREFIX:1416case X86::REP_MOVSB_32:1417case X86::REP_MOVSB_64:1418case X86::REP_MOVSD_32:1419case X86::REP_MOVSD_64:1420case X86::REP_MOVSQ_32:1421case X86::REP_MOVSQ_64:1422case X86::REP_MOVSW_32:1423case X86::REP_MOVSW_64:1424case X86::REP_PREFIX:1425case X86::REP_STOSB_32:1426case X86::REP_STOSB_64:1427case X86::REP_STOSD_32:1428case X86::REP_STOSD_64:1429case X86::REP_STOSQ_32:1430case X86::REP_STOSQ_64:1431case X86::REP_STOSW_32:1432case 
X86::REP_STOSW_64:1433return true;1434default:1435break;1436}1437return false;1438}14391440/// emitPrologue - Push callee-saved registers onto the stack, which1441/// automatically adjust the stack pointer. Adjust the stack pointer to allocate1442/// space for local variables. Also emit labels used by the exception handler to1443/// generate the exception handling frames.14441445/*1446Here's a gist of what gets emitted:14471448; Establish frame pointer, if needed1449[if needs FP]1450push %rbp1451.cfi_def_cfa_offset 161452.cfi_offset %rbp, -161453.seh_pushreg %rpb1454mov %rsp, %rbp1455.cfi_def_cfa_register %rbp14561457; Spill general-purpose registers1458[for all callee-saved GPRs]1459pushq %<reg>1460[if not needs FP]1461.cfi_def_cfa_offset (offset from RETADDR)1462.seh_pushreg %<reg>14631464; If the required stack alignment > default stack alignment1465; rsp needs to be re-aligned. This creates a "re-alignment gap"1466; of unknown size in the stack frame.1467[if stack needs re-alignment]1468and $MASK, %rsp14691470; Allocate space for locals1471[if target is Windows and allocated space > 4096 bytes]1472; Windows needs special care for allocations larger1473; than one page.1474mov $NNN, %rax1475call ___chkstk_ms/___chkstk1476sub %rax, %rsp1477[else]1478sub $NNN, %rsp14791480[if needs FP]1481.seh_stackalloc (size of XMM spill slots)1482.seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots1483[else]1484.seh_stackalloc NNN14851486; Spill XMMs1487; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,1488; they may get spilled on any platform, if the current function1489; calls @llvm.eh.unwind.init1490[if needs FP]1491[for all callee-saved XMM registers]1492movaps %<xmm reg>, -MMM(%rbp)1493[for all callee-saved XMM registers]1494.seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)1495; i.e. the offset relative to (%rbp - SEHFrameOffset)1496[else]1497[for all callee-saved XMM registers]1498movaps %<xmm reg>, KKK(%rsp)1499[for all callee-saved XMM registers]1500.seh_savexmm %<xmm reg>, KKK15011502.seh_endprologue15031504[if needs base pointer]1505mov %rsp, %rbx1506[if needs to restore base pointer]1507mov %rsp, -MMM(%rbp)15081509; Emit CFI info1510[if needs FP]1511[for all callee-saved registers]1512.cfi_offset %<reg>, (offset from %rbp)1513[else]1514.cfi_def_cfa_offset (offset from RETADDR)1515[for all callee-saved registers]1516.cfi_offset %<reg>, (offset from %rsp)15171518Notes:1519- .seh directives are emitted only for Windows 64 ABI1520- .cv_fpo directives are emitted on win32 when emitting CodeView1521- .cfi directives are emitted for all other ABIs1522- for 32-bit code, substitute %e?? 
registers for %r??1523*/15241525void X86FrameLowering::emitPrologue(MachineFunction &MF,1526MachineBasicBlock &MBB) const {1527assert(&STI == &MF.getSubtarget<X86Subtarget>() &&1528"MF used frame lowering for wrong subtarget");1529MachineBasicBlock::iterator MBBI = MBB.begin();1530MachineFrameInfo &MFI = MF.getFrameInfo();1531const Function &Fn = MF.getFunction();1532X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();1533uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.1534uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.1535bool IsFunclet = MBB.isEHFuncletEntry();1536EHPersonality Personality = EHPersonality::Unknown;1537if (Fn.hasPersonalityFn())1538Personality = classifyEHPersonality(Fn.getPersonalityFn());1539bool FnHasClrFunclet =1540MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;1541bool IsClrFunclet = IsFunclet && FnHasClrFunclet;1542bool HasFP = hasFP(MF);1543bool IsWin64Prologue = isWin64Prologue(MF);1544bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();1545// FIXME: Emit FPO data for EH funclets.1546bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() &&1547MF.getFunction().getParent()->getCodeViewFlag();1548bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;1549bool NeedsDwarfCFI = needsDwarfCFI(MF);1550Register FramePtr = TRI->getFrameRegister(MF);1551const Register MachineFramePtr =1552STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))1553: FramePtr;1554Register BasePtr = TRI->getBaseRegister();1555bool HasWinCFI = false;15561557// Debug location must be unknown since the first debug location is used1558// to determine the end of the prologue.1559DebugLoc DL;1560Register ArgBaseReg;15611562// Emit extra prolog for argument stack slot reference.1563if (auto *MI = X86FI->getStackPtrSaveMI()) {1564// MI is lea instruction that created in X86ArgumentStackSlotPass.1565// Creat extra prolog for stack realignment.1566ArgBaseReg = MI->getOperand(0).getReg();1567// leal 4(%esp), %basereg1568// .cfi_def_cfa %basereg, 01569// andl $-128, %esp1570// pushl -4(%basereg)1571BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::LEA64r : X86::LEA32r),1572ArgBaseReg)1573.addUse(StackPtr)1574.addImm(1)1575.addUse(X86::NoRegister)1576.addImm(SlotSize)1577.addUse(X86::NoRegister)1578.setMIFlag(MachineInstr::FrameSetup);1579if (NeedsDwarfCFI) {1580// .cfi_def_cfa %basereg, 01581unsigned DwarfStackPtr = TRI->getDwarfRegNum(ArgBaseReg, true);1582BuildCFI(MBB, MBBI, DL,1583MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, 0),1584MachineInstr::FrameSetup);1585}1586BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);1587int64_t Offset = -(int64_t)SlotSize;1588BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm))
        .addReg(ArgBaseReg)
        .addImm(1)
        .addReg(X86::NoRegister)
        .addImm(Offset)
        .addReg(X86::NoRegister)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Space reserved for stack-based arguments when making a (ABI-guaranteed)
  // tail call.
  unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
  if (TailCallArgReserveSize && IsWin64Prologue)
    report_fatal_error("Can't handle guaranteed tail call under win64 yet");

  const bool EmitStackProbeCall =
      STI.getTargetLowering()->hasStackProbeSymbol(MF);
  unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);

  if (HasFP && X86FI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      if (STI.swiftAsyncContextIsDynamicallySet()) {
        // The special symbol below is absolute and has a *value* suitable to
        // be combined with the frame pointer directly.
        BuildMI(MBB, MBBI, DL, TII.get(X86::OR64rm), MachineFramePtr)
            .addUse(MachineFramePtr)
            .addUse(X86::RIP)
            .addImm(1)
            .addUse(X86::NoRegister)
            .addExternalSymbol("swift_async_extendedFramePointerFlags",
                               X86II::MO_GOTPCREL)
            .addUse(X86::NoRegister);
        break;
      }
      [[fallthrough]];

    case SwiftAsyncFramePointerMode::Always:
      assert(
          !IsWin64Prologue &&
          "win64 prologue does not set the bit 60 in the saved frame pointer");
      BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8), MachineFramePtr)
          .addUse(MachineFramePtr)
          .addImm(60)
          .setMIFlag(MachineInstr::FrameSetup);
      break;

    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }

  // Re-align the stack on 64-bit if the x86-interrupt calling convention is
  // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
  // stack alignment.
  if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
      Fn.arg_size() == 2) {
    StackSize += 8;
    MFI.setStackSize(StackSize);

    // Update the stack pointer by pushing a register. This is the instruction
    // that would end up being emitted by a call to `emitSPUpdate`.
    // Hard-coding the update to a push avoids emitting a second
    // `STACKALLOC_W_PROBING` instruction in the save block: We know that stack
    // probing isn't needed anyway for an 8-byte update.
    // Pushing a register leaves us in a similar situation to a regular
    // function call where we know that the address at (rsp-8) is writeable.
    // That way we avoid any off-by-ones with stack probing for additional
    // stack pointer updates later on.
    BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
        .addReg(X86::RAX, RegState::Undef)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If this is x86-64, the Red Zone is not disabled, we are a leaf function,
  // we use up to 128 bytes of stack space, and we don't have a frame pointer,
  // calls, or dynamic allocas, then we do not need to adjust the stack
  // pointer (we fit in the Red Zone).
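  //
  // Illustrative sketch (not code emitted by this function): a leaf function
  // with, say, 24 bytes of locals and no frame pointer can leave RSP untouched
  // and address its locals at negative offsets, e.g.
  //     movl %edi, -4(%rsp)
  //     movq %rsi, -16(%rsp)
  // because the x86-64 System V ABI guarantees that the 128 bytes below RSP
  // (the red zone) are not clobbered by signal or interrupt handlers.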
  // We also check that we don't push and pop from the stack.
  if (has128ByteRedZone(MF) && !TRI->hasStackRealignment(MF) &&
      !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
      !MFI.adjustsStack() &&                   // No calls.
      !EmitStackProbeCall &&                   // No stack probes.
      !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
      !MF.shouldSplitStack()) {                // Regular stack
    uint64_t MinSize =
        X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
    if (HasFP)
      MinSize += SlotSize;
    X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
    MFI.setStackSize(StackSize);
  }

  // Insert stack pointer adjustment for later moving of return addr. Only
  // applies to tail call optimized functions where the callee argument stack
  // size is bigger than the caller's.
  if (TailCallArgReserveSize != 0) {
    BuildStackAdjustment(MBB, MBBI, DL, -(int)TailCallArgReserveSize,
                         /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Mapping for machine moves:
  //
  //   DST: VirtualFP AND
  //        SRC: VirtualFP          => DW_CFA_def_cfa_offset
  //        ELSE                    => DW_CFA_def_cfa
  //
  //   SRC: VirtualFP AND
  //        DST: Register           => DW_CFA_def_cfa_register
  //
  //   ELSE
  //        OFFSET < 0              => DW_CFA_offset_extended_sf
  //        REG < 64                => DW_CFA_offset + Reg
  //        ELSE                    => DW_CFA_offset_extended

  uint64_t NumBytes = 0;
  int stackGrowth = -SlotSize;

  // Find the funclet establisher parameter
  Register Establisher = X86::NoRegister;
  if (IsClrFunclet)
    Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
  else if (IsFunclet)
    Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;

  if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
    // Immediately spill the establisher into the home slot.
    // The runtime cares about this.
    // MOV64mr %rdx, 16(%rsp)
    unsigned MOVmr = Uses64BitFramePtr ?
X86::MOV64mr : X86::MOV32mr;1720addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)1721.addReg(Establisher)1722.setMIFlag(MachineInstr::FrameSetup);1723MBB.addLiveIn(Establisher);1724}17251726if (HasFP) {1727assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");17281729// Calculate required stack adjustment.1730uint64_t FrameSize = StackSize - SlotSize;1731NumBytes =1732FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);17331734// Callee-saved registers are pushed on stack before the stack is realigned.1735if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)1736NumBytes = alignTo(NumBytes, MaxAlign);17371738// Save EBP/RBP into the appropriate stack slot.1739BuildMI(MBB, MBBI, DL,1740TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))1741.addReg(MachineFramePtr, RegState::Kill)1742.setMIFlag(MachineInstr::FrameSetup);17431744if (NeedsDwarfCFI && !ArgBaseReg.isValid()) {1745// Mark the place where EBP/RBP was saved.1746// Define the current CFA rule to use the provided offset.1747assert(StackSize);1748BuildCFI(MBB, MBBI, DL,1749MCCFIInstruction::cfiDefCfaOffset(1750nullptr, -2 * stackGrowth + (int)TailCallArgReserveSize),1751MachineInstr::FrameSetup);17521753// Change the rule for the FramePtr to be an "offset" rule.1754unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);1755BuildCFI(MBB, MBBI, DL,1756MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,17572 * stackGrowth -1758(int)TailCallArgReserveSize),1759MachineInstr::FrameSetup);1760}17611762if (NeedsWinCFI) {1763HasWinCFI = true;1764BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1765.addImm(FramePtr)1766.setMIFlag(MachineInstr::FrameSetup);1767}17681769if (!IsFunclet) {1770if (X86FI->hasSwiftAsyncContext()) {1771assert(!IsWin64Prologue &&1772"win64 prologue does not store async context right below rbp");1773const auto &Attrs = MF.getFunction().getAttributes();17741775// Before we update the live frame pointer we have to ensure there's a1776// valid (or null) asynchronous context in its slot just before FP in1777// the frame record, so store it now.1778if (Attrs.hasAttrSomewhere(Attribute::SwiftAsync)) {1779// We have an initial context in r14, store it just before the frame1780// pointer.1781MBB.addLiveIn(X86::R14);1782BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))1783.addReg(X86::R14)1784.setMIFlag(MachineInstr::FrameSetup);1785} else {1786// No initial context, store null so that there's no pointer that1787// could be misused.1788BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64i32))1789.addImm(0)1790.setMIFlag(MachineInstr::FrameSetup);1791}17921793if (NeedsWinCFI) {1794HasWinCFI = true;1795BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1796.addImm(X86::R14)1797.setMIFlag(MachineInstr::FrameSetup);1798}17991800BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr)1801.addUse(X86::RSP)1802.addImm(1)1803.addUse(X86::NoRegister)1804.addImm(8)1805.addUse(X86::NoRegister)1806.setMIFlag(MachineInstr::FrameSetup);1807BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64ri32), X86::RSP)1808.addUse(X86::RSP)1809.addImm(8)1810.setMIFlag(MachineInstr::FrameSetup);1811}18121813if (!IsWin64Prologue && !IsFunclet) {1814// Update EBP with the new base value.1815if (!X86FI->hasSwiftAsyncContext())1816BuildMI(MBB, MBBI, DL,1817TII.get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr),1818FramePtr)1819.addReg(StackPtr)1820.setMIFlag(MachineInstr::FrameSetup);18211822if (NeedsDwarfCFI) {1823if (ArgBaseReg.isValid()) {1824SmallString<64> CfaExpr;1825CfaExpr.push_back(dwarf::DW_CFA_expression);1826uint8_t buffer[16];1827unsigned DwarfReg = TRI->getDwarfRegNum(MachineFramePtr, true);1828CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));1829CfaExpr.push_back(2);1830CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));1831CfaExpr.push_back(0);1832// DW_CFA_expression: reg5 DW_OP_breg5 +01833BuildCFI(MBB, MBBI, DL,1834MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),1835MachineInstr::FrameSetup);1836} else {1837// Mark effective beginning of when frame pointer becomes valid.1838// Define the current CFA to use the EBP/RBP register.1839unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);1840BuildCFI(1841MBB, MBBI, DL,1842MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr),1843MachineInstr::FrameSetup);1844}1845}18461847if (NeedsWinFPO) {1848// .cv_fpo_setframe $FramePtr1849HasWinCFI = true;1850BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))1851.addImm(FramePtr)1852.addImm(0)1853.setMIFlag(MachineInstr::FrameSetup);1854}1855}1856}1857} else {1858assert(!IsFunclet && "funclets without FPs not yet implemented");1859NumBytes =1860StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);1861}18621863// Update the offset adjustment, which is mainly used by codeview to translate1864// from ESP to VFRAME relative local variable offsets.1865if (!IsFunclet) {1866if (HasFP && TRI->hasStackRealignment(MF))1867MFI.setOffsetAdjustment(-NumBytes);1868else1869MFI.setOffsetAdjustment(-StackSize);1870}18711872// For EH funclets, only allocate enough space for outgoing calls. 
Save the1873// NumBytes value that we would've used for the parent frame.1874unsigned ParentFrameNumBytes = NumBytes;1875if (IsFunclet)1876NumBytes = getWinEHFuncletFrameSize(MF);18771878// Skip the callee-saved push instructions.1879bool PushedRegs = false;1880int StackOffset = 2 * stackGrowth;1881MachineBasicBlock::const_iterator LastCSPush = MBBI;1882auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {1883if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))1884return false;1885unsigned Opc = MBBI->getOpcode();1886return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||1887Opc == X86::PUSH2 || Opc == X86::PUSH2P;1888};18891890while (IsCSPush(MBBI)) {1891PushedRegs = true;1892Register Reg = MBBI->getOperand(0).getReg();1893LastCSPush = MBBI;1894++MBBI;1895unsigned Opc = LastCSPush->getOpcode();18961897if (!HasFP && NeedsDwarfCFI) {1898// Mark callee-saved push instruction.1899// Define the current CFA rule to use the provided offset.1900assert(StackSize);1901// Compared to push, push2 introduces more stack offset (one more1902// register).1903if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)1904StackOffset += stackGrowth;1905BuildCFI(MBB, MBBI, DL,1906MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),1907MachineInstr::FrameSetup);1908StackOffset += stackGrowth;1909}19101911if (NeedsWinCFI) {1912HasWinCFI = true;1913BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1914.addImm(Reg)1915.setMIFlag(MachineInstr::FrameSetup);1916if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)1917BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))1918.addImm(LastCSPush->getOperand(1).getReg())1919.setMIFlag(MachineInstr::FrameSetup);1920}1921}19221923// Realign stack after we pushed callee-saved registers (so that we'll be1924// able to calculate their offsets from the frame pointer).1925// Don't do this for Win64, it needs to realign the stack after the prologue.1926if (!IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF) &&1927!ArgBaseReg.isValid()) {1928assert(HasFP && "There should be a frame pointer if stack is realigned.");1929BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);19301931if (NeedsWinCFI) {1932HasWinCFI = true;1933BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))1934.addImm(MaxAlign)1935.setMIFlag(MachineInstr::FrameSetup);1936}1937}19381939// If there is an SUB32ri of ESP immediately before this instruction, merge1940// the two. This can be the case when tail call elimination is enabled and1941// the callee has more arguments then the caller.1942NumBytes -= mergeSPUpdates(MBB, MBBI, true);19431944// Adjust stack pointer: ESP -= numbytes.19451946// Windows and cygwin/mingw require a prologue helper routine when allocating1947// more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw1948// uses __alloca. __alloca and the 32-bit version of __chkstk will probe the1949// stack and adjust the stack pointer in one go. The 64-bit version of1950// __chkstk is only responsible for probing the stack. The 64-bit prologue is1951// responsible for adjusting the stack pointer. 
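  // Illustrative example (sizes assumed): for a 72 KiB allocation on Win64 the
  // emitted sequence is roughly
  //     movl  $73728, %eax
  //     callq __chkstk        ; probes the pages but does not move RSP
  //     subq  %rax, %rsp      ; the prologue performs the actual adjustment
  // whereas the 32-bit __chkstk/__alloca helpers both probe and adjust ESP.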
Touching the stack at 4K1952// increments is necessary to ensure that the guard pages used by the OS1953// virtual memory manager are allocated in correct sequence.1954uint64_t AlignedNumBytes = NumBytes;1955if (IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF))1956AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);1957if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {1958assert(!X86FI->getUsesRedZone() &&1959"The Red Zone is not accounted for in stack probes");19601961// Check whether EAX is livein for this block.1962bool isEAXAlive = isEAXLiveIn(MBB);19631964if (isEAXAlive) {1965if (Is64Bit) {1966// Save RAX1967BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))1968.addReg(X86::RAX, RegState::Kill)1969.setMIFlag(MachineInstr::FrameSetup);1970} else {1971// Save EAX1972BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))1973.addReg(X86::EAX, RegState::Kill)1974.setMIFlag(MachineInstr::FrameSetup);1975}1976}19771978if (Is64Bit) {1979// Handle the 64-bit Windows ABI case where we need to call __chkstk.1980// Function prologue is responsible for adjusting the stack pointer.1981int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;1982BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX)1983.addImm(Alloc)1984.setMIFlag(MachineInstr::FrameSetup);1985} else {1986// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.1987// We'll also use 4 already allocated bytes for EAX.1988BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)1989.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)1990.setMIFlag(MachineInstr::FrameSetup);1991}19921993// Call __chkstk, __chkstk_ms, or __alloca.1994emitStackProbe(MF, MBB, MBBI, DL, true);19951996if (isEAXAlive) {1997// Restore RAX/EAX1998MachineInstr *MI;1999if (Is64Bit)2000MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),2001StackPtr, false, NumBytes - 8);2002else2003MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),2004StackPtr, false, NumBytes - 4);2005MI->setFlag(MachineInstr::FrameSetup);2006MBB.insert(MBBI, MI);2007}2008} else if (NumBytes) {2009emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);2010}20112012if (NeedsWinCFI && NumBytes) {2013HasWinCFI = true;2014BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))2015.addImm(NumBytes)2016.setMIFlag(MachineInstr::FrameSetup);2017}20182019int SEHFrameOffset = 0;2020unsigned SPOrEstablisher;2021if (IsFunclet) {2022if (IsClrFunclet) {2023// The establisher parameter passed to a CLR funclet is actually a pointer2024// to the (mostly empty) frame of its nearest enclosing funclet; we have2025// to find the root function establisher frame by loading the PSPSym from2026// the intermediate frame.2027unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);2028MachinePointerInfo NoInfo;2029MBB.addLiveIn(Establisher);2030addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),2031Establisher, false, PSPSlotOffset)2032.addMemOperand(MF.getMachineMemOperand(2033NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));2034;2035// Save the root establisher back into the current funclet's (mostly2036// empty) frame, in case a sub-funclet or the GC needs it.2037addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,2038false, PSPSlotOffset)2039.addReg(Establisher)2040.addMemOperand(MF.getMachineMemOperand(2041NoInfo,2042MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,2043SlotSize, Align(SlotSize)));2044}2045SPOrEstablisher = Establisher;2046} else {2047SPOrEstablisher = StackPtr;2048}20492050if 
(IsWin64Prologue && HasFP) {2051// Set RBP to a small fixed offset from RSP. In the funclet case, we base2052// this calculation on the incoming establisher, which holds the value of2053// RSP from the parent frame at the end of the prologue.2054SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);2055if (SEHFrameOffset)2056addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),2057SPOrEstablisher, false, SEHFrameOffset);2058else2059BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)2060.addReg(SPOrEstablisher);20612062// If this is not a funclet, emit the CFI describing our frame pointer.2063if (NeedsWinCFI && !IsFunclet) {2064assert(!NeedsWinFPO && "this setframe incompatible with FPO data");2065HasWinCFI = true;2066BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))2067.addImm(FramePtr)2068.addImm(SEHFrameOffset)2069.setMIFlag(MachineInstr::FrameSetup);2070if (isAsynchronousEHPersonality(Personality))2071MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;2072}2073} else if (IsFunclet && STI.is32Bit()) {2074// Reset EBP / ESI to something good for funclets.2075MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);2076// If we're a catch funclet, we can be returned to via catchret. Save ESP2077// into the registration node so that the runtime will restore it for us.2078if (!MBB.isCleanupFuncletEntry()) {2079assert(Personality == EHPersonality::MSVC_CXX);2080Register FrameReg;2081int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;2082int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();2083// ESP is the first field, so no extra displacement is needed.2084addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,2085false, EHRegOffset)2086.addReg(X86::ESP);2087}2088}20892090while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {2091const MachineInstr &FrameInstr = *MBBI;2092++MBBI;20932094if (NeedsWinCFI) {2095int FI;2096if (Register Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {2097if (X86::FR64RegClass.contains(Reg)) {2098int Offset;2099Register IgnoredFrameReg;2100if (IsWin64Prologue && IsFunclet)2101Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);2102else2103Offset =2104getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +2105SEHFrameOffset;21062107HasWinCFI = true;2108assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");2109BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))2110.addImm(Reg)2111.addImm(Offset)2112.setMIFlag(MachineInstr::FrameSetup);2113}2114}2115}2116}21172118if (NeedsWinCFI && HasWinCFI)2119BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))2120.setMIFlag(MachineInstr::FrameSetup);21212122if (FnHasClrFunclet && !IsFunclet) {2123// Save the so-called Initial-SP (i.e. 
the value of the stack pointer2124// immediately after the prolog) into the PSPSlot so that funclets2125// and the GC can recover it.2126unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);2127auto PSPInfo = MachinePointerInfo::getFixedStack(2128MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);2129addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,2130PSPSlotOffset)2131.addReg(StackPtr)2132.addMemOperand(MF.getMachineMemOperand(2133PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,2134SlotSize, Align(SlotSize)));2135}21362137// Realign stack after we spilled callee-saved registers (so that we'll be2138// able to calculate their offsets from the frame pointer).2139// Win64 requires aligning the stack after the prologue.2140if (IsWin64Prologue && TRI->hasStackRealignment(MF)) {2141assert(HasFP && "There should be a frame pointer if stack is realigned.");2142BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);2143}21442145// We already dealt with stack realignment and funclets above.2146if (IsFunclet && STI.is32Bit())2147return;21482149// If we need a base pointer, set it up here. It's whatever the value2150// of the stack pointer is at this point. Any variable size objects2151// will be allocated after this, so we can still use the base pointer2152// to reference locals.2153if (TRI->hasBasePointer(MF)) {2154// Update the base pointer with the current stack pointer.2155unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;2156BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)2157.addReg(SPOrEstablisher)2158.setMIFlag(MachineInstr::FrameSetup);2159if (X86FI->getRestoreBasePointer()) {2160// Stash value of base pointer. Saving RSP instead of EBP shortens2161// dependence chain. Used by SjLj EH.2162unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;2163addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true,2164X86FI->getRestoreBasePointerOffset())2165.addReg(SPOrEstablisher)2166.setMIFlag(MachineInstr::FrameSetup);2167}21682169if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {2170// Stash the value of the frame pointer relative to the base pointer for2171// Win32 EH. This supports Win32 EH, which does the inverse of the above:2172// it recovers the frame pointer from the base pointer rather than the2173// other way around.2174unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;2175Register UsedReg;2176int Offset =2177getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)2178.getFixed();2179assert(UsedReg == BasePtr);2180addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)2181.addReg(FramePtr)2182.setMIFlag(MachineInstr::FrameSetup);2183}2184}2185if (ArgBaseReg.isValid()) {2186// Save argument base pointer.2187auto *MI = X86FI->getStackPtrSaveMI();2188int FI = MI->getOperand(1).getIndex();2189unsigned MOVmr = Is64Bit ? 
X86::MOV64mr : X86::MOV32mr;2190// movl %basereg, offset(%ebp)2191addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), FI)2192.addReg(ArgBaseReg)2193.setMIFlag(MachineInstr::FrameSetup);2194}21952196if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {2197// Mark end of stack pointer adjustment.2198if (!HasFP && NumBytes) {2199// Define the current CFA rule to use the provided offset.2200assert(StackSize);2201BuildCFI(2202MBB, MBBI, DL,2203MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth),2204MachineInstr::FrameSetup);2205}22062207// Emit DWARF info specifying the offsets of the callee-saved registers.2208emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);2209}22102211// X86 Interrupt handling function cannot assume anything about the direction2212// flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction2213// in each prologue of interrupt handler function.2214//2215// Create "cld" instruction only in these cases:2216// 1. The interrupt handling function uses any of the "rep" instructions.2217// 2. Interrupt handling function calls another function.2218// 3. If there are any inline asm blocks, as we do not know what they do2219//2220// TODO: We should also emit cld if we detect the use of std, but as of now,2221// the compiler does not even emit that instruction or even define it, so in2222// practice, this would only happen with inline asm, which we cover anyway.2223if (Fn.getCallingConv() == CallingConv::X86_INTR) {2224bool NeedsCLD = false;22252226for (const MachineBasicBlock &B : MF) {2227for (const MachineInstr &MI : B) {2228if (MI.isCall()) {2229NeedsCLD = true;2230break;2231}22322233if (isOpcodeRep(MI.getOpcode())) {2234NeedsCLD = true;2235break;2236}22372238if (MI.isInlineAsm()) {2239// TODO: Parse asm for rep instructions or call sites?2240// For now, let's play it safe and emit a cld instruction2241// just in case.2242NeedsCLD = true;2243break;2244}2245}2246}22472248if (NeedsCLD) {2249BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))2250.setMIFlag(MachineInstr::FrameSetup);2251}2252}22532254// At this point we know if the function has WinCFI or not.2255MF.setHasWinCFI(HasWinCFI);2256}22572258bool X86FrameLowering::canUseLEAForSPInEpilogue(2259const MachineFunction &MF) const {2260// We can't use LEA instructions for adjusting the stack pointer if we don't2261// have a frame pointer in the Win64 ABI. Only ADD instructions may be used2262// to deallocate the stack.2263// This means that we can use LEA for SP in two situations:2264// 1. We *aren't* using the Win64 ABI which means we are free to use LEA.2265// 2. We *have* a frame pointer which means we are permitted to use LEA.2266return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);2267}22682269static bool isFuncletReturnInstr(MachineInstr &MI) {2270switch (MI.getOpcode()) {2271case X86::CATCHRET:2272case X86::CLEANUPRET:2273return true;2274default:2275return false;2276}2277llvm_unreachable("impossible");2278}22792280// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the2281// stack. It holds a pointer to the bottom of the root function frame. The2282// establisher frame pointer passed to a nested funclet may point to the2283// (mostly empty) frame of its parent funclet, but it will need to find2284// the frame of the root function to access locals. To facilitate this,2285// every funclet copies the pointer to the bottom of the root function2286// frame into a PSPSym slot in its own (mostly empty) stack frame. 
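// Illustrative layout (offsets assumed): if the root function stores the
// PSPSym at [RSP + K] right after its prologue, every funclet sizes its own
// frame so that its [RSP + K] is the same slot:
//     root frame:    [RSP + K] = bottom of the root frame
//     each funclet:  [RSP + K] = copy of that same pointer
// so a single offset K, reported once for the whole method, works no matter
// which frame the runtime or the GC is looking at.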
Using the2287// same offset for the PSPSym in the root function frame that's used in the2288// funclets' frames allows each funclet to dynamically accept any ancestor2289// frame as its establisher argument (the runtime doesn't guarantee the2290// immediate parent for some reason lost to history), and also allows the GC,2291// which uses the PSPSym for some bookkeeping, to find it in any funclet's2292// frame with only a single offset reported for the entire method.2293unsigned2294X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {2295const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();2296Register SPReg;2297int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,2298/*IgnoreSPUpdates*/ true)2299.getFixed();2300assert(Offset >= 0 && SPReg == TRI->getStackRegister());2301return static_cast<unsigned>(Offset);2302}23032304unsigned2305X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {2306const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2307// This is the size of the pushed CSRs.2308unsigned CSSize = X86FI->getCalleeSavedFrameSize();2309// This is the size of callee saved XMMs.2310const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();2311unsigned XMMSize =2312WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass);2313// This is the amount of stack a funclet needs to allocate.2314unsigned UsedSize;2315EHPersonality Personality =2316classifyEHPersonality(MF.getFunction().getPersonalityFn());2317if (Personality == EHPersonality::CoreCLR) {2318// CLR funclets need to hold enough space to include the PSPSym, at the2319// same offset from the stack pointer (immediately after the prolog) as it2320// resides at in the main function.2321UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;2322} else {2323// Other funclets just need enough stack for outgoing call arguments.2324UsedSize = MF.getFrameInfo().getMaxCallFrameSize();2325}2326// RBP is not included in the callee saved register block. After pushing RBP,2327// everything is 16 byte aligned. Everything we allocate before an outgoing2328// call must also be 16 byte aligned.2329unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());2330// Subtract out the size of the callee saved registers. This is how much stack2331// each funclet will allocate.2332return FrameSizeMinusRBP + XMMSize - CSSize;2333}23342335static bool isTailCallOpcode(unsigned Opc) {2336return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||2337Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||2338Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;2339}23402341void X86FrameLowering::emitEpilogue(MachineFunction &MF,2342MachineBasicBlock &MBB) const {2343const MachineFrameInfo &MFI = MF.getFrameInfo();2344X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2345MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();2346MachineBasicBlock::iterator MBBI = Terminator;2347DebugLoc DL;2348if (MBBI != MBB.end())2349DL = MBBI->getDebugLoc();2350// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.2351const bool Is64BitILP32 = STI.isTarget64BitILP32();2352Register FramePtr = TRI->getFrameRegister(MF);2353Register MachineFramePtr =2354Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;23552356bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();2357bool NeedsWin64CFI =2358IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();2359bool IsFunclet = MBBI == MBB.end() ? 
false : isFuncletReturnInstr(*MBBI);23602361// Get the number of bytes to allocate from the FrameInfo.2362uint64_t StackSize = MFI.getStackSize();2363uint64_t MaxAlign = calculateMaxStackAlign(MF);2364unsigned CSSize = X86FI->getCalleeSavedFrameSize();2365unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();2366bool HasFP = hasFP(MF);2367uint64_t NumBytes = 0;23682369bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&2370!MF.getTarget().getTargetTriple().isOSWindows()) &&2371MF.needsFrameMoves();23722373Register ArgBaseReg;2374if (auto *MI = X86FI->getStackPtrSaveMI()) {2375unsigned Opc = X86::LEA32r;2376Register StackReg = X86::ESP;2377ArgBaseReg = MI->getOperand(0).getReg();2378if (STI.is64Bit()) {2379Opc = X86::LEA64r;2380StackReg = X86::RSP;2381}2382// leal -4(%basereg), %esp2383// .cfi_def_cfa %esp, 42384BuildMI(MBB, MBBI, DL, TII.get(Opc), StackReg)2385.addUse(ArgBaseReg)2386.addImm(1)2387.addUse(X86::NoRegister)2388.addImm(-(int64_t)SlotSize)2389.addUse(X86::NoRegister)2390.setMIFlag(MachineInstr::FrameDestroy);2391if (NeedsDwarfCFI) {2392unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackReg, true);2393BuildCFI(MBB, MBBI, DL,2394MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),2395MachineInstr::FrameDestroy);2396--MBBI;2397}2398--MBBI;2399}24002401if (IsFunclet) {2402assert(HasFP && "EH funclets without FP not yet implemented");2403NumBytes = getWinEHFuncletFrameSize(MF);2404} else if (HasFP) {2405// Calculate required stack adjustment.2406uint64_t FrameSize = StackSize - SlotSize;2407NumBytes = FrameSize - CSSize - TailCallArgReserveSize;24082409// Callee-saved registers were pushed on stack before the stack was2410// realigned.2411if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)2412NumBytes = alignTo(FrameSize, MaxAlign);2413} else {2414NumBytes = StackSize - CSSize - TailCallArgReserveSize;2415}2416uint64_t SEHStackAllocAmt = NumBytes;24172418// AfterPop is the position to insert .cfi_restore.2419MachineBasicBlock::iterator AfterPop = MBBI;2420if (HasFP) {2421if (X86FI->hasSwiftAsyncContext()) {2422// Discard the context.2423int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);2424emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);2425}2426// Pop EBP.2427BuildMI(MBB, MBBI, DL,2428TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),2429MachineFramePtr)2430.setMIFlag(MachineInstr::FrameDestroy);24312432// We need to reset FP to its untagged state on return. Bit 60 is currently2433// used to show the presence of an extended frame.2434if (X86FI->hasSwiftAsyncContext()) {2435BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr)2436.addUse(MachineFramePtr)2437.addImm(60)2438.setMIFlag(MachineInstr::FrameDestroy);2439}24402441if (NeedsDwarfCFI) {2442if (!ArgBaseReg.isValid()) {2443unsigned DwarfStackPtr =2444TRI->getDwarfRegNum(Is64Bit ? 
X86::RSP : X86::ESP, true);2445BuildCFI(MBB, MBBI, DL,2446MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),2447MachineInstr::FrameDestroy);2448}2449if (!MBB.succ_empty() && !MBB.isReturnBlock()) {2450unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);2451BuildCFI(MBB, AfterPop, DL,2452MCCFIInstruction::createRestore(nullptr, DwarfFramePtr),2453MachineInstr::FrameDestroy);2454--MBBI;2455--AfterPop;2456}2457--MBBI;2458}2459}24602461MachineBasicBlock::iterator FirstCSPop = MBBI;2462// Skip the callee-saved pop instructions.2463while (MBBI != MBB.begin()) {2464MachineBasicBlock::iterator PI = std::prev(MBBI);2465unsigned Opc = PI->getOpcode();24662467if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {2468if (!PI->getFlag(MachineInstr::FrameDestroy) ||2469(Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&2470Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&2471Opc != X86::POP2P && Opc != X86::LEA64r))2472break;2473FirstCSPop = PI;2474}24752476--MBBI;2477}2478if (ArgBaseReg.isValid()) {2479// Restore argument base pointer.2480auto *MI = X86FI->getStackPtrSaveMI();2481int FI = MI->getOperand(1).getIndex();2482unsigned MOVrm = Is64Bit ? X86::MOV64rm : X86::MOV32rm;2483// movl offset(%ebp), %basereg2484addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVrm), ArgBaseReg), FI)2485.setMIFlag(MachineInstr::FrameDestroy);2486}2487MBBI = FirstCSPop;24882489if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)2490emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);24912492if (MBBI != MBB.end())2493DL = MBBI->getDebugLoc();2494// If there is an ADD32ri or SUB32ri of ESP immediately before this2495// instruction, merge the two instructions.2496if (NumBytes || MFI.hasVarSizedObjects())2497NumBytes += mergeSPUpdates(MBB, MBBI, true);24982499// If dynamic alloca is used, then reset esp to point to the last callee-saved2500// slot before popping them off! Same applies for the case, when stack was2501// realigned. Don't do this if this was a funclet epilogue, since the funclets2502// will not do realignment or dynamic stack allocation.2503if (((TRI->hasStackRealignment(MF)) || MFI.hasVarSizedObjects()) &&2504!IsFunclet) {2505if (TRI->hasStackRealignment(MF))2506MBBI = FirstCSPop;2507unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);2508uint64_t LEAAmount =2509IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;25102511if (X86FI->hasSwiftAsyncContext())2512LEAAmount -= 16;25132514// There are only two legal forms of epilogue:2515// - add SEHAllocationSize, %rsp2516// - lea SEHAllocationSize(%FramePtr), %rsp2517//2518// 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.2519// However, we may use this sequence if we have a frame pointer because the2520// effects of the prologue can safely be undone.2521if (LEAAmount != 0) {2522unsigned Opc = getLEArOpcode(Uses64BitFramePtr);2523addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr,2524false, LEAAmount);2525--MBBI;2526} else {2527unsigned Opc = (Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr);2528BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);2529--MBBI;2530}2531} else if (NumBytes) {2532// Adjust stack pointer back: ESP += numbytes.2533emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);2534if (!HasFP && NeedsDwarfCFI) {2535// Define the current CFA rule to use the provided offset.2536BuildCFI(MBB, MBBI, DL,2537MCCFIInstruction::cfiDefCfaOffset(2538nullptr, CSSize + TailCallArgReserveSize + SlotSize),2539MachineInstr::FrameDestroy);2540}2541--MBBI;2542}25432544// Windows unwinder will not invoke function's exception handler if IP is2545// either in prologue or in epilogue. This behavior causes a problem when a2546// call immediately precedes an epilogue, because the return address points2547// into the epilogue. To cope with that, we insert an epilogue marker here,2548// then replace it with a 'nop' if it ends up immediately after a CALL in the2549// final emitted code.2550if (NeedsWin64CFI && MF.hasWinCFI())2551BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));25522553if (!HasFP && NeedsDwarfCFI) {2554MBBI = FirstCSPop;2555int64_t Offset = -(int64_t)CSSize - SlotSize;2556// Mark callee-saved pop instruction.2557// Define the current CFA rule to use the provided offset.2558while (MBBI != MBB.end()) {2559MachineBasicBlock::iterator PI = MBBI;2560unsigned Opc = PI->getOpcode();2561++MBBI;2562if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||2563Opc == X86::POP2 || Opc == X86::POP2P) {2564Offset += SlotSize;2565// Compared to pop, pop2 introduces more stack offset (one more2566// register).2567if (Opc == X86::POP2 || Opc == X86::POP2P)2568Offset += SlotSize;2569BuildCFI(MBB, MBBI, DL,2570MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),2571MachineInstr::FrameDestroy);2572}2573}2574}25752576// Emit DWARF info specifying the restores of the callee-saved registers.2577// For epilogue with return inside or being other block without successor,2578// no need to generate .cfi_restore for callee-saved registers.2579if (NeedsDwarfCFI && !MBB.succ_empty())2580emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);25812582if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {2583// Add the return addr area delta back since we are not tail calling.2584int Offset = -1 * X86FI->getTCReturnAddrDelta();2585assert(Offset >= 0 && "TCDelta should never be positive");2586if (Offset) {2587// Check for possible merge with preceding ADD instruction.2588Offset += mergeSPUpdates(MBB, Terminator, true);2589emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);2590}2591}25922593// Emit tilerelease for AMX kernel.2594if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA)2595BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));2596}25972598StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,2599int FI,2600Register &FrameReg) const {2601const MachineFrameInfo &MFI = MF.getFrameInfo();26022603bool IsFixed = MFI.isFixedObjectIndex(FI);2604// We can't calculate offset from frame pointer if the stack is realigned,2605// so enforce usage of stack/base pointer. The base pointer is used when we2606// have dynamic allocas in addition to dynamic realignment.2607if (TRI->hasBasePointer(MF))2608FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();2609else if (TRI->hasStackRealignment(MF))2610FrameReg = IsFixed ? 
TRI->getFramePtr() : TRI->getStackRegister();2611else2612FrameReg = TRI->getFrameRegister(MF);26132614// Offset will hold the offset from the stack pointer at function entry to the2615// object.2616// We need to factor in additional offsets applied during the prologue to the2617// frame, base, and stack pointer depending on which is used.2618int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();2619const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2620unsigned CSSize = X86FI->getCalleeSavedFrameSize();2621uint64_t StackSize = MFI.getStackSize();2622bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();2623int64_t FPDelta = 0;26242625// In an x86 interrupt, remove the offset we added to account for the return2626// address from any stack object allocated in the caller's frame. Interrupts2627// do not have a standard return address. Fixed objects in the current frame,2628// such as SSE register spills, should not get this treatment.2629if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&2630Offset >= 0) {2631Offset += getOffsetOfLocalArea();2632}26332634if (IsWin64Prologue) {2635assert(!MFI.hasCalls() || (StackSize % 16) == 8);26362637// Calculate required stack adjustment.2638uint64_t FrameSize = StackSize - SlotSize;2639// If required, include space for extra hidden slot for stashing base2640// pointer.2641if (X86FI->getRestoreBasePointer())2642FrameSize += SlotSize;2643uint64_t NumBytes = FrameSize - CSSize;26442645uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);2646if (FI && FI == X86FI->getFAIndex())2647return StackOffset::getFixed(-SEHFrameOffset);26482649// FPDelta is the offset from the "traditional" FP location of the old base2650// pointer followed by return address and the location required by the2651// restricted Win64 prologue.2652// Add FPDelta to all offsets below that go through the frame pointer.2653FPDelta = FrameSize - SEHFrameOffset;2654assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&2655"FPDelta isn't aligned per the Win64 ABI!");2656}26572658if (FrameReg == TRI->getFramePtr()) {2659// Skip saved EBP/RBP2660Offset += SlotSize;26612662// Account for restricted Windows prologue.2663Offset += FPDelta;26642665// Skip the RETADDR move area2666int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();2667if (TailCallReturnAddrDelta < 0)2668Offset -= TailCallReturnAddrDelta;26692670return StackOffset::getFixed(Offset);2671}26722673// FrameReg is either the stack pointer or a base pointer. 
But the base is2674// located at the end of the statically known StackSize so the distinction2675// doesn't really matter.2676if (TRI->hasStackRealignment(MF) || TRI->hasBasePointer(MF))2677assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));2678return StackOffset::getFixed(Offset + StackSize);2679}26802681int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,2682Register &FrameReg) const {2683const MachineFrameInfo &MFI = MF.getFrameInfo();2684const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2685const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();2686const auto it = WinEHXMMSlotInfo.find(FI);26872688if (it == WinEHXMMSlotInfo.end())2689return getFrameIndexReference(MF, FI, FrameReg).getFixed();26902691FrameReg = TRI->getStackRegister();2692return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +2693it->second;2694}26952696StackOffset2697X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,2698Register &FrameReg,2699int Adjustment) const {2700const MachineFrameInfo &MFI = MF.getFrameInfo();2701FrameReg = TRI->getStackRegister();2702return StackOffset::getFixed(MFI.getObjectOffset(FI) -2703getOffsetOfLocalArea() + Adjustment);2704}27052706StackOffset2707X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,2708int FI, Register &FrameReg,2709bool IgnoreSPUpdates) const {27102711const MachineFrameInfo &MFI = MF.getFrameInfo();2712// Does not include any dynamic realign.2713const uint64_t StackSize = MFI.getStackSize();2714// LLVM arranges the stack as follows:2715// ...2716// ARG22717// ARG12718// RETADDR2719// PUSH RBP <-- RBP points here2720// PUSH CSRs2721// ~~~~~~~ <-- possible stack realignment (non-win64)2722// ...2723// STACK OBJECTS2724// ... <-- RSP after prologue points here2725// ~~~~~~~ <-- possible stack realignment (win64)2726//2727// if (hasVarSizedObjects()):2728// ... <-- "base pointer" (ESI/RBX) points here2729// DYNAMIC ALLOCAS2730// ... <-- RSP points here2731//2732// Case 1: In the simple case of no stack realignment and no dynamic2733// allocas, both "fixed" stack objects (arguments and CSRs) are addressable2734// with fixed offsets from RSP.2735//2736// Case 2: In the case of stack realignment with no dynamic allocas, fixed2737// stack objects are addressed with RBP and regular stack objects with RSP.2738//2739// Case 3: In the case of dynamic allocas and stack realignment, RSP is used2740// to address stack arguments for outgoing calls and nothing else. The "base2741// pointer" points to local variables, and RBP points to fixed objects.2742//2743// In cases 2 and 3, we can only answer for non-fixed stack objects, and the2744// answer we give is relative to the SP after the prologue, and not the2745// SP in the middle of the function.27462747if (MFI.isFixedObjectIndex(FI) && TRI->hasStackRealignment(MF) &&2748!STI.isTargetWin64())2749return getFrameIndexReference(MF, FI, FrameReg);27502751// If !hasReservedCallFrame the function might have SP adjustement in the2752// body. So, even though the offset is statically known, it depends on where2753// we are in the function.2754if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))2755return getFrameIndexReference(MF, FI, FrameReg);27562757// We don't handle tail calls, and shouldn't be seeing them either.2758assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&2759"we don't handle this case!");27602761// This is how the math works out:2762//2763// %rsp grows (i.e. 
gets lower) left to right. Each box below is
  // one word (eight bytes). Obj0 is the stack slot we're trying to
  // get to.
  //
  //    ----------------------------------
  //    | BP | Obj0 | Obj1 | ... | ObjN |
  //    ----------------------------------
  //    ^    ^      ^                   ^
  //    A    B      C                   E
  //
  // A is the incoming stack pointer.
  // (B - A) is the local area offset (-8 for x86-64) [1]
  // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
  //
  // |(E - B)| is the StackSize (absolute value, positive). For a
  // stack that grows down, this works out to be (B - E). [3]
  //
  // E is also the value of %rsp after the stack has been set up, and we
  // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
  //    (C - E) == (C - A) - (B - A) + (B - E)
  //            { Using [1], [2] and [3] above }
  //            == getObjectOffset - LocalAreaOffset + StackSize

  return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
}

bool X86FrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  unsigned CalleeSavedFrameSize = 0;
  unsigned XMMCalleeSavedFrameSize = 0;
  auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();

  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();

  if (TailCallReturnAddrDelta < 0) {
    // create RETURNADDR area
    //   arg
    //   arg
    //   RETADDR
    //   { ...
    //     RETADDR area
    //     ...
    //   }
    //   [EBP]
    MFI.CreateFixedObject(-TailCallReturnAddrDelta,
                          TailCallReturnAddrDelta - SlotSize, true);
  }

  // Spill the BasePtr if it's used.
  if (this->TRI->hasBasePointer(MF)) {
    // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
    if (MF.hasEHFunclets()) {
      int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
      X86FI->setHasSEHFramePtrSave(true);
      X86FI->setSEHFramePtrSaveIndex(FI);
    }
  }

  if (hasFP(MF)) {
    // emitPrologue always spills the frame register first.
    SpillSlotOffset -= SlotSize;
    MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);

    // The async context lives directly before the frame pointer, and we
    // allocate a second slot to preserve stack alignment.
    if (X86FI->hasSwiftAsyncContext()) {
      SpillSlotOffset -= SlotSize;
      MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
      SpillSlotOffset -= SlotSize;
    }

    // Since emitPrologue and emitEpilogue will handle spilling and restoring
    // of the frame register, we can delete it from the CSI list and not have
    // to worry about avoiding it later.
    Register FPReg = TRI->getFrameRegister(MF);
    for (unsigned i = 0; i < CSI.size(); ++i) {
      if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
        CSI.erase(CSI.begin() + i);
        break;
      }
    }
  }

  // Strategy:
  // 1. Use push2 when
  //      a) the number of CSRs > 1, if no padding is needed
  //      b) the number of CSRs > 2, if padding is needed
  // 2. When the number of CSR push is odd
  //      a. Start to use push2 from the 1st push if stack is 16B aligned.
  //      b. Start to use push2 from the 2nd push if stack is not 16B aligned.
  // 3.
When the number of CSR push is even, start to use push2 from the 1st2859// push and make the stack 16B aligned before the push2860unsigned NumRegsForPush2 = 0;2861if (STI.hasPush2Pop2()) {2862unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {2863return X86::GR64RegClass.contains(I.getReg());2864});2865bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);2866bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;2867X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);2868NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;2869if (X86FI->padForPush2Pop2()) {2870SpillSlotOffset -= SlotSize;2871MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2872}2873}28742875// Assign slots for GPRs. It increases frame size.2876for (CalleeSavedInfo &I : llvm::reverse(CSI)) {2877Register Reg = I.getReg();28782879if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))2880continue;28812882// A CSR is a candidate for push2/pop2 when it's slot offset is 16B aligned2883// or only an odd number of registers in the candidates.2884if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&2885(SpillSlotOffset % 16 == 0 ||2886X86FI->getNumCandidatesForPush2Pop2() % 2))2887X86FI->addCandidateForPush2Pop2(Reg);28882889SpillSlotOffset -= SlotSize;2890CalleeSavedFrameSize += SlotSize;28912892int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2893I.setFrameIdx(SlotIndex);2894}28952896// Adjust the offset of spill slot as we know the accurate callee saved frame2897// size.2898if (X86FI->getRestoreBasePointer()) {2899SpillSlotOffset -= SlotSize;2900CalleeSavedFrameSize += SlotSize;29012902MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);2903// TODO: saving the slot index is better?2904X86FI->setRestoreBasePointer(CalleeSavedFrameSize);2905}2906assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&2907"Expect even candidates for push2/pop2");2908if (X86FI->getNumCandidatesForPush2Pop2())2909++NumFunctionUsingPush2Pop2;2910X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);2911MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);29122913// Assign slots for XMMs.2914for (CalleeSavedInfo &I : llvm::reverse(CSI)) {2915Register Reg = I.getReg();2916if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))2917continue;29182919// If this is k-register make sure we lookup via the largest legal type.2920MVT VT = MVT::Other;2921if (X86::VK16RegClass.contains(Reg))2922VT = STI.hasBWI() ? 
MVT::v64i1 : MVT::v16i1;29232924const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);2925unsigned Size = TRI->getSpillSize(*RC);2926Align Alignment = TRI->getSpillAlign(*RC);2927// ensure alignment2928assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");2929SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);29302931// spill into slot2932SpillSlotOffset -= Size;2933int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);2934I.setFrameIdx(SlotIndex);2935MFI.ensureMaxAlignment(Alignment);29362937// Save the start offset and size of XMM in stack frame for funclets.2938if (X86::VR128RegClass.contains(Reg)) {2939WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;2940XMMCalleeSavedFrameSize += Size;2941}2942}29432944return true;2945}29462947bool X86FrameLowering::spillCalleeSavedRegisters(2948MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,2949ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {2950DebugLoc DL = MBB.findDebugLoc(MI);29512952// Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI2953// for us, and there are no XMM CSRs on Win32.2954if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())2955return true;29562957// Push GPRs. It increases frame size.2958const MachineFunction &MF = *MBB.getParent();2959const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();2960if (X86FI->padForPush2Pop2())2961emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);29622963// Update LiveIn of the basic block and decide whether we can add a kill flag2964// to the use.2965auto UpdateLiveInCheckCanKill = [&](Register Reg) {2966const MachineRegisterInfo &MRI = MF.getRegInfo();2967// Do not set a kill flag on values that are also marked as live-in. This2968// happens with the @llvm-returnaddress intrinsic and with arguments2969// passed in callee saved registers.2970// Omitting the kill flags is conservatively correct even if the live-in2971// is not used after all.2972if (MRI.isLiveIn(Reg))2973return false;2974MBB.addLiveIn(Reg);2975// Check if any subregister is live-in2976for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)2977if (MRI.isLiveIn(*AReg))2978return false;2979return true;2980};2981auto UpdateLiveInGetKillRegState = [&](Register Reg) {2982return getKillRegState(UpdateLiveInCheckCanKill(Reg));2983};29842985for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {2986Register Reg = RI->getReg();2987if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))2988continue;29892990if (X86FI->isCandidateForPush2Pop2(Reg)) {2991Register Reg2 = (++RI)->getReg();2992BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))2993.addReg(Reg, UpdateLiveInGetKillRegState(Reg))2994.addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))2995.setMIFlag(MachineInstr::FrameSetup);2996} else {2997BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))2998.addReg(Reg, UpdateLiveInGetKillRegState(Reg))2999.setMIFlag(MachineInstr::FrameSetup);3000}3001}30023003if (X86FI->getRestoreBasePointer()) {3004unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;3005Register BaseReg = this->TRI->getBaseRegister();3006BuildMI(MBB, MI, DL, TII.get(Opc))3007.addReg(BaseReg, getKillRegState(true))3008.setMIFlag(MachineInstr::FrameSetup);3009}30103011// Make XMM regs spilled. 
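  // For example, on Win64 a callee-saved %xmm6 is not pushed; it is stored to
  // its fixed 16-byte spill slot via storeRegToStackSlot below (the
  // "movaps %<xmm reg>, KKK(%rsp)" form shown in the prologue gist above).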
X86 does not have ability of push/pop XMM.3012// It can be done by spilling XMMs to stack frame.3013for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {3014Register Reg = I.getReg();3015if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))3016continue;30173018// If this is k-register make sure we lookup via the largest legal type.3019MVT VT = MVT::Other;3020if (X86::VK16RegClass.contains(Reg))3021VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;30223023// Add the callee-saved register as live-in. It's killed at the spill.3024MBB.addLiveIn(Reg);3025const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);30263027TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,3028Register());3029--MI;3030MI->setFlag(MachineInstr::FrameSetup);3031++MI;3032}30333034return true;3035}30363037void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,3038MachineBasicBlock::iterator MBBI,3039MachineInstr *CatchRet) const {3040// SEH shouldn't use catchret.3041assert(!isAsynchronousEHPersonality(classifyEHPersonality(3042MBB.getParent()->getFunction().getPersonalityFn())) &&3043"SEH should not use CATCHRET");3044const DebugLoc &DL = CatchRet->getDebugLoc();3045MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();30463047// Fill EAX/RAX with the address of the target block.3048if (STI.is64Bit()) {3049// LEA64r CatchRetTarget(%rip), %rax3050BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)3051.addReg(X86::RIP)3052.addImm(0)3053.addReg(0)3054.addMBB(CatchRetTarget)3055.addReg(0);3056} else {3057// MOV32ri $CatchRetTarget, %eax3058BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)3059.addMBB(CatchRetTarget);3060}30613062// Record that we've taken the address of CatchRetTarget and no longer just3063// reference it in a terminator.3064CatchRetTarget->setMachineBlockAddressTaken();3065}30663067bool X86FrameLowering::restoreCalleeSavedRegisters(3068MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,3069MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {3070if (CSI.empty())3071return false;30723073if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {3074// Don't restore CSRs in 32-bit EH funclets. Matches3075// spillCalleeSavedRegisters.3076if (STI.is32Bit())3077return true;3078// Don't restore CSRs before an SEH catchret. SEH except blocks do not form3079// funclets. emitEpilogue transforms these to normal jumps.3080if (MI->getOpcode() == X86::CATCHRET) {3081const Function &F = MBB.getParent()->getFunction();3082bool IsSEH = isAsynchronousEHPersonality(3083classifyEHPersonality(F.getPersonalityFn()));3084if (IsSEH)3085return true;3086}3087}30883089DebugLoc DL = MBB.findDebugLoc(MI);30903091// Reload XMMs from stack frame.3092for (const CalleeSavedInfo &I : CSI) {3093Register Reg = I.getReg();3094if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))3095continue;30963097// If this is k-register make sure we lookup via the largest legal type.3098MVT VT = MVT::Other;3099if (X86::VK16RegClass.contains(Reg))3100VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;31013102const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);3103TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,3104Register());3105}31063107// Clear the stack slot for spill base pointer register.3108MachineFunction &MF = *MBB.getParent();3109const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();3110if (X86FI->getRestoreBasePointer()) {3111unsigned Opc = STI.is64Bit() ? 
    unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
    Register BaseReg = this->TRI->getBaseRegister();
    BuildMI(MBB, MI, DL, TII.get(Opc), BaseReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  // POP GPRs.
  for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
    Register Reg = I->getReg();
    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
      continue;

    if (X86FI->isCandidateForPush2Pop2(Reg))
      BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
          .addReg((++I)->getReg(), RegState::Define)
          .setMIFlag(MachineInstr::FrameDestroy);
    else
      BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
          .setMIFlag(MachineInstr::FrameDestroy);
  }
  if (X86FI->padForPush2Pop2())
    emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);

  return true;
}

void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedRegs,
                                            RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

  // Spill the BasePtr if it's used.
  if (TRI->hasBasePointer(MF)) {
    Register BasePtr = TRI->getBaseRegister();
    if (STI.isTarget64BitILP32())
      BasePtr = getX86SubSuperRegister(BasePtr, 64);
    SavedRegs.set(BasePtr);
  }
}

static bool HasNestArgument(const MachineFunction *MF) {
  const Function &F = MF->getFunction();
  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
       I++) {
    if (I->hasNestAttr() && !I->use_empty())
      return true;
  }
  return false;
}

/// GetScratchRegister - Get a temp register for performing work in the
/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
/// and the properties of the function either one or two registers will be
/// needed. Set primary to true for the first register, false for the second.
static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64,
                                   const MachineFunction &MF, bool Primary) {
  CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();

  // Erlang stuff.
  if (CallingConvention == CallingConv::HiPE) {
    if (Is64Bit)
      return Primary ? X86::R14 : X86::R13;
    else
      return Primary ? X86::EBX : X86::EDI;
  }

  if (Is64Bit) {
    if (IsLP64)
      return Primary ? X86::R11 : X86::R12;
    else
      return Primary ? X86::R11D : X86::R12D;
  }

  bool IsNested = HasNestArgument(&MF);

  if (CallingConvention == CallingConv::X86_FastCall ||
      CallingConvention == CallingConv::Fast ||
      CallingConvention == CallingConv::Tail) {
    if (IsNested)
      report_fatal_error("Segmented stacks do not support fastcall with "
                         "nested functions.");
    return Primary ? X86::EAX : X86::ECX;
  }
  if (IsNested)
    return Primary ? X86::EDX : X86::EAX;
  return Primary ? X86::ECX : X86::EAX;
}

// The stack limit in the TCB is set to this many bytes above the actual stack
// limit.
static const uint64_t kSplitStackAvailable = 256;

void X86FrameLowering::adjustForSegmentedStacks(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t StackSize;
  unsigned TlsReg, TlsOffset;
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
         "Scratch register is live-in");

  if (MF.getFunction().isVarArg())
    report_fatal_error("Segmented stacks do not support vararg functions.");
  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
      !STI.isTargetDragonFly())
    report_fatal_error("Segmented stacks not supported on this platform.");

  // Eventually StackSize will be calculated by a link-time pass, which will
  // also decide whether checking code needs to be injected into this
  // particular prologue.
  StackSize = MFI.getStackSize();

  if (!MFI.needsSplitStackProlog())
    return;

  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  bool IsNested = false;

  // We need to know if the function has a nest argument only in 64 bit mode.
  if (Is64Bit)
    IsNested = HasNestArgument(&MF);

  // The MOV R10, RAX needs to be in a different block, since the RET we emit
  // in allocMBB needs to be the last (terminating) instruction.

  for (const auto &LI : PrologueMBB.liveins()) {
    allocMBB->addLiveIn(LI);
    checkMBB->addLiveIn(LI);
  }

  if (IsNested)
    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);

  MF.push_front(allocMBB);
  MF.push_front(checkMBB);

  // When the frame size is less than 256 bytes we just compare the stack
  // boundary directly to the value of the stack pointer, per gcc.
  bool CompareStackPointer = StackSize < kSplitStackAvailable;

  // Read the limit of the current stacklet from the stack_guard location.
  if (Is64Bit) {
    if (STI.isTargetLinux()) {
      TlsReg = X86::FS;
      TlsOffset = IsLP64 ? 0x70 : 0x40;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90.
    } else if (STI.isTargetWin64()) {
      TlsReg = X86::GS;
      TlsOffset = 0x28; // pvArbitrary, reserved for application use
    } else if (STI.isTargetFreeBSD()) {
      TlsReg = X86::FS;
      TlsOffset = 0x18;
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
              ScratchReg)
          .addReg(X86::RSP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);
    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm))
        .addReg(ScratchReg)
        .addReg(0)
        .addImm(1)
        .addReg(0)
        .addImm(TlsOffset)
        .addReg(TlsReg);
  } else {
    if (STI.isTargetLinux()) {
      TlsReg = X86::GS;
      TlsOffset = 0x30;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x48 + 90 * 4;
    } else if (STI.isTargetWin32()) {
      TlsReg = X86::FS;
      TlsOffset = 0x14; // pvArbitrary, reserved for application use
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
    } else if (STI.isTargetFreeBSD()) {
      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg)
          .addReg(X86::ESP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);

    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
        STI.isTargetDragonFly()) {
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(0)
          .addImm(0)
          .addReg(0)
          .addImm(TlsOffset)
          .addReg(TlsReg);
    } else if (STI.isTargetDarwin()) {

      // TlsOffset doesn't fit into a mod r/m byte so we need an extra
      // register.
      unsigned ScratchReg2;
      bool SaveScratch2;
      if (CompareStackPointer) {
        // The primary scratch register is available for holding the TLS
        // offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
        SaveScratch2 = false;
      } else {
        // Need to use a second register to hold the TLS offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);

        // Unfortunately, with fastcc the second scratch register may hold an
        // argument.
        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
      }

      // If Scratch2 is live-in then it needs to be saved.
      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
             "Scratch register is live-in and not saved");

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
            .addReg(ScratchReg2, RegState::Kill);

      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
          .addImm(TlsOffset);
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(ScratchReg2)
          .addImm(1)
          .addReg(0)
          .addImm(0)
          .addReg(TlsReg);

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
    }
  }

  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
  // It jumps to normal execution of the function body.
  BuildMI(checkMBB, DL, TII.get(X86::JCC_1))
      .addMBB(&PrologueMBB)
      .addImm(X86::COND_A);
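  // Illustrative sketch (assuming x86-64 Linux, LP64, and a frame larger than
  // kSplitStackAvailable): the check block built above roughly corresponds to
  //   leaq -<StackSize>(%rsp), %r11   # ScratchReg = SP minus frame size
  //   cmpq %fs:0x70, %r11             # compare against the stacklet limit
  //   ja   <PrologueMBB>              # enough space: run the normal prologue
  // with the fall-through edge leading to the __morestack call emitted below.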
  // On 32 bit we first push the arguments size and then the frame size. On 64
  // bit, we pass the stack frame size in r10 and the argument size in r11.
  if (Is64Bit) {
    // Functions with nested arguments use R10, so it needs to be saved across
    // the call to __morestack.

    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;

    if (IsNested)
      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);

    BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10)
        .addImm(StackSize);
    BuildMI(allocMBB, DL,
            TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())),
            Reg11)
        .addImm(X86FI->getArgumentStackSize());
  } else {
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
        .addImm(X86FI->getArgumentStackSize());
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize);
  }

  // __morestack is in libgcc.
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // Under the large code model, we cannot assume that __morestack lives
    // within 2^31 bytes of the call site, so we cannot use pc-relative
    // addressing. We cannot perform the call via a temporary register,
    // as the rax register may be used to store the static chain, and all
    // other suitable registers may be either callee-save or used for
    // parameter passing. We cannot use the stack at this point either
    // because __morestack manipulates the stack directly.
    //
    // To avoid these issues, perform an indirect call via a read-only memory
    // location containing the address.
    //
    // This solution is not perfect, as it assumes that the .rodata section
    // is laid out within 2^31 bytes of each function body, but this seems
    // to be sufficient for JIT.
    // FIXME: Add retpoline support and remove the error here.
    if (STI.useIndirectThunkCalls())
      report_fatal_error("Emitting morestack calls on 64-bit with the large "
                         "code model and thunks not yet implemented.");
    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
        .addReg(X86::RIP)
        .addImm(0)
        .addReg(0)
        .addExternalSymbol("__morestack_addr")
        .addReg(0);
  } else {
    if (Is64Bit)
      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
          .addExternalSymbol("__morestack");
    else
      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("__morestack");
  }

  if (IsNested)
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
  else
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));

  allocMBB->addSuccessor(&PrologueMBB);

  checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
  checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());

#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
/// HiPE provides Erlang Runtime System-internal parameters, such as PCB
/// offsets to fields it needs, through a named metadata node "hipe.literals"
/// containing name-value pairs.
static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD,
                               const StringRef LiteralName) {
  for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
    MDNode *Node = HiPELiteralsMD->getOperand(i);
    if (Node->getNumOperands() != 2)
      continue;
    MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
    ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
    if (!NodeName || !NodeVal)
      continue;
    ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
    if (ValConst && NodeName->getString() == LiteralName) {
      return ValConst->getZExtValue();
    }
  }

  report_fatal_error("HiPE literal " + LiteralName +
                     " required but not provided");
}
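// For illustration only, the named metadata consumed by getHiPELiteral might
// look like this in IR (the numeric values below are hypothetical; the real
// ones are provided by the Erlang runtime):
//
//   !hipe.literals = !{!0, !1, !2}
//   !0 = !{!"P_NSP_LIMIT", i32 152}
//   !1 = !{!"X86_LEAF_WORDS", i32 24}
//   !2 = !{!"AMD64_LEAF_WORDS", i32 24}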
// Return true if there are no non-ehpad successors to MBB and there are no
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
                                  MachineBasicBlock::const_iterator MBBI) {
  return llvm::all_of(
             MBB.successors(),
             [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
         std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
           return MI.isMetaInstruction();
         });
}

/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of a hybrid stack/heap architecture.
/// (For more information see Eric Stenman's Ph.D. thesis:
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
///       temp0 = sp - MaxStack
///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
///       ...
/// IncStack:
///       call inc_stack   # doubles the stack space
///       temp0 = sp - MaxStack
///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  // HiPE-specific values.
  NamedMDNode *HiPELiteralsMD =
      MF.getFunction().getParent()->getNamedMetadata("hipe.literals");
  if (!HiPELiteralsMD)
    report_fatal_error(
        "Can't generate HiPE prologue without runtime parameters");
  const unsigned HipeLeafWords = getHiPELiteral(
      HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
  const unsigned Guaranteed = HipeLeafWords * SlotSize;
  unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs
                                ? MF.getFunction().arg_size() - CCRegisteredArgs
                                : 0;
  unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize;

  assert(STI.isTargetLinux() &&
         "HiPE prologue is only supported on Linux operating systems.");
  // Compute the largest caller's frame that is needed to fit the callees'
  // frames. This 'MaxStack' is computed from:
  //
  // a) the fixed frame size, which is the space needed for all spilled temps,
  // b) outgoing on-stack parameter areas, and
  // c) the minimum stack space this function needs to make available for the
  //    functions it calls (a tunable ABI property).
  if (MFI.hasCalls()) {
    unsigned MoreStackForCalls = 0;

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        if (!MI.isCall())
          continue;

        // Get callee operand.
        const MachineOperand &MO = MI.getOperand(0);

        // Only take account of global function calls (no closures etc.).
        if (!MO.isGlobal())
          continue;

        const Function *F = dyn_cast<Function>(MO.getGlobal());
        if (!F)
          continue;

        // Do not update 'MaxStack' for primitive and built-in functions
        // (encoded with names either starting with "erlang."/"bif_" or not
        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
        // "_", such as the BIF "suspend_0") as they are executed on another
        // stack.
        if (F->getName().contains("erlang.") || F->getName().contains("bif_") ||
            F->getName().find_first_of("._") == StringRef::npos)
          continue;

        unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs
                                      ? F->arg_size() - CCRegisteredArgs
                                      : 0;
        if (HipeLeafWords - 1 > CalleeStkArity)
          MoreStackForCalls =
              std::max(MoreStackForCalls,
                       (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
      }
    }
    MaxStack += MoreStackForCalls;
  }

  // If the stack frame needed is larger than the guaranteed amount, runtime
  // checks and calls to the "inc_stack_0" BIF should be inserted in the
  // assembly prologue.
  if (MaxStack > Guaranteed) {
    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();

    for (const auto &LI : PrologueMBB.liveins()) {
      stackCheckMBB->addLiveIn(LI);
      incStackMBB->addLiveIn(LI);
    }

    MF.push_front(incStackMBB);
    MF.push_front(stackCheckMBB);

    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
    unsigned LEAop, CMPop, CALLop;
    SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
    if (Is64Bit) {
      SPReg = X86::RSP;
      PReg = X86::RBP;
      LEAop = X86::LEA64r;
      CMPop = X86::CMP64rm;
      CALLop = X86::CALL64pcrel32;
    } else {
      SPReg = X86::ESP;
      PReg = X86::EBP;
      LEAop = X86::LEA32r;
      CMPop = X86::CMP32rm;
      CALLop = X86::CALLpcrel32;
    }

    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
           "HiPE prologue scratch register is live-in");

    // Create new MBB for StackCheck:
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    // SPLimitOffset is in a fixed heap location (pointed by BP).
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1))
        .addMBB(&PrologueMBB)
        .addImm(X86::COND_AE);

    // Create new MBB for IncStack:
    BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0");
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(incStackMBB, DL, TII.get(X86::JCC_1))
        .addMBB(incStackMBB)
        .addImm(X86::COND_LE);
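    // Illustrative sketch (64-bit, with a hypothetical MaxStack of 320 bytes):
    // the two blocks built above roughly correspond to
    //   stackCheckMBB:  leaq -320(%rsp), %r14
    //                   cmpq <P_NSP_LIMIT>(%rbp), %r14
    //                   jae  <PrologueMBB>
    //   incStackMBB:    callq inc_stack_0
    //                   leaq -320(%rsp), %r14
    //                   cmpq <P_NSP_LIMIT>(%rbp), %r14
    //                   jle  incStackMBB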
    stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
    stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
    incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
    incStackMBB->addSuccessor(incStackMBB, {1, 100});
  }
#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           const DebugLoc &DL,
                                           int Offset) const {
  if (Offset <= 0)
    return false;

  if (Offset % SlotSize)
    return false;

  int NumPops = Offset / SlotSize;
  // This is only worth it if we have at most 2 pops.
  if (NumPops != 1 && NumPops != 2)
    return false;

  // Handle only the trivial case where the adjustment directly follows
  // a call. This is the most common one, anyway.
  if (MBBI == MBB.begin())
    return false;
  MachineBasicBlock::iterator Prev = std::prev(MBBI);
  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
    return false;

  unsigned Regs[2];
  unsigned FoundRegs = 0;

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const MachineOperand &RegMask = Prev->getOperand(1);

  auto &RegClass =
      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
  // Try to find up to NumPops free registers.
  for (auto Candidate : RegClass) {
    // Poor man's liveness:
    // Since we're immediately after a call, any register that is clobbered
    // by the call and not defined by it can be considered dead.
    if (!RegMask.clobbersPhysReg(Candidate))
      continue;

    // Don't clobber reserved registers.
    if (MRI.isReserved(Candidate))
      continue;

    bool IsDef = false;
    for (const MachineOperand &MO : Prev->implicit_operands()) {
      if (MO.isReg() && MO.isDef() &&
          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
        IsDef = true;
        break;
      }
    }

    if (IsDef)
      continue;

    Regs[FoundRegs++] = Candidate;
    if (FoundRegs == (unsigned)NumPops)
      break;
  }

  if (FoundRegs == 0)
    return false;

  // If we found only one free register, but need two, reuse the same one
  // twice.
  while (FoundRegs < (unsigned)NumPops)
    Regs[FoundRegs++] = Regs[0];

  for (int i = 0; i < NumPops; ++i)
    BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r),
            Regs[i]);

  return true;
}
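// A rough size example (assuming typical x86 encodings; not taken from the
// source): in a 32-bit minsize function, an 8-byte adjustment right after a
// call can be emitted as two one-byte pops of registers picked from
// GR32_NOREX_NOSP instead of a three-byte "addl $8, %esp", at the cost of two
// dead register writes.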
MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  bool reserveCallFrame = hasReservedCallFrame(MF);
  unsigned Opcode = I->getOpcode();
  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
  DebugLoc DL = I->getDebugLoc(); // copy DebugLoc as I will be erased.
  uint64_t Amount = TII.getFrameSize(*I);
  uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
  I = MBB.erase(I);
  auto InsertPos = skipDebugInstructionsForward(I, MBB.end());

  // Try to avoid emitting dead SP adjustments if the block end is unreachable,
  // typically because the function is marked noreturn (abort, throw,
  // assert_fail, etc).
  if (isDestroy && blockEndIsUnreachable(MBB, I))
    return I;

  if (!reserveCallFrame) {
    // If the stack pointer can be changed after prologue, turn the
    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
    // adjcallstackup instruction into 'add ESP, <amt>'.

    // We need to keep the stack aligned properly. To do this, we round the
    // amount of space needed for the outgoing arguments up to the next
    // alignment boundary.
    Amount = alignTo(Amount, getStackAlign());

    const Function &F = MF.getFunction();
    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
    bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();

    // If we have any exception handlers in this function, and we adjust
    // the SP before calls, we may need to indicate this to the unwinder
    // using GNU_ARGS_SIZE. Note that this may be necessary even when
    // Amount == 0, because the preceding function may have set a non-0
    // GNU_ARGS_SIZE.
    // TODO: We don't need to reset this between subsequent functions,
    // if it didn't change.
    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();

    if (HasDwarfEHHandlers && !isDestroy &&
        MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createGnuArgsSize(nullptr, Amount));

    if (Amount == 0)
      return I;

    // Factor out the amount that gets handled inside the sequence
    // (pushes of arguments for frame setup, callee pops for frame destroy).
    Amount -= InternalAmt;

    // TODO: This is needed only if we require precise CFA.
    // If this is a callee-pop calling convention, emit a CFA adjust for
    // the amount the callee popped.
    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));

    // Add Amount to SP to destroy a frame, or subtract to setup.
    int64_t StackAdjustment = isDestroy ? Amount : -Amount;

    if (StackAdjustment) {
      // Merge with any previous or following adjustment instruction. Note: the
      // instructions merged with here do not have CFI, so their stack
      // adjustments do not feed into CfaAdjustment.
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);

      if (StackAdjustment) {
        if (!(F.hasMinSize() &&
              adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
          BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
                               /*InEpilogue=*/false);
      }
    }
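    // Illustrative example (hypothetical amount): with call frame reservation
    // disabled, a call site bracketed by adjcallstackdown/adjcallstackup
    // pseudos for a 32-byte argument area is lowered here to 'sub ESP, 32'
    // before the call and 'add ESP, 32' after it, unless the adjustment is
    // merged into a neighbouring SP update or turned into pops above.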
    if (DwarfCFI && !hasFP(MF)) {
      // If we don't have FP, but need to generate unwind information,
      // we need to set the correct CFA offset after the stack adjustment.
      // How much we adjust the CFA offset depends on whether we're emitting
      // CFI only for EH purposes or for debugging. EH only requires the CFA
      // offset to be correct at each call site, while for debugging we want
      // it to be more precise.

      int64_t CfaAdjustment = -StackAdjustment;
      // TODO: When not using precise CFA, we also need to adjust for the
      // InternalAmt here.
      if (CfaAdjustment) {
        BuildCFI(
            MBB, InsertPos, DL,
            MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
      }
    }

    return I;
  }

  if (InternalAmt) {
    MachineBasicBlock::iterator CI = I;
    MachineBasicBlock::iterator B = MBB.begin();
    while (CI != B && !std::prev(CI)->isCall())
      --CI;
    BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
  }

  return I;
}

bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");
  const MachineFunction &MF = *MBB.getParent();
  if (!MBB.isLiveIn(X86::EFLAGS))
    return true;

  // If stack probes have to loop inline or call, that will clobber EFLAGS.
  // FIXME: we could allow cases that will use emitStackProbeInlineGenericBlock.
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  if (TLI.hasInlineStackProbe(MF) || TLI.hasStackProbeSymbol(MF))
    return false;

  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  return !TRI->hasStackRealignment(MF) && !X86FI->hasSwiftAsyncContext();
}

bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");

  // Win64 has strict requirements on the epilogue, and we are not taking a
  // chance at messing with them. I.e., unless this block is already an exit
  // block, we can't use it as an epilogue.
  if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
    return false;

  // Swift async context epilogue has a BTR instruction that clobbers parts of
  // EFLAGS.
  const MachineFunction &MF = *MBB.getParent();
  if (MF.getInfo<X86MachineFunctionInfo>()->hasSwiftAsyncContext())
    return !flagsNeedToBePreservedBeforeTheTerminators(MBB);

  if (canUseLEAForSPInEpilogue(*MBB.getParent()))
    return true;

  // If we cannot use LEA to adjust SP, we may need to use ADD, which
  // clobbers the EFLAGS. Check that we do not need to preserve it;
  // otherwise, conservatively assume it is not safe to insert the epilogue
  // here.
  return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
}

bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
  // If we may need to emit frameless compact unwind information, give
  // up as this is currently broken: PR25614.
  bool CompactUnwind =
      MF.getContext().getObjectFileInfo()->getCompactUnwindSection() != nullptr;
  return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
          !CompactUnwind) &&
         // The lowering of segmented stacks and HiPE only supports entry
         // blocks as prologue blocks: PR26107. This limitation may be
         // lifted if we fix:
         // - adjustForSegmentedStacks
         // - adjustForHiPEPrologue
         MF.getFunction().getCallingConv() != CallingConv::HiPE &&
         !MF.shouldSplitStack();
}

MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, bool RestoreSP) const {
  assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
  assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
  assert(STI.is32Bit() && !Uses64BitFramePtr &&
         "restoring EBP/ESI on non-32-bit target");

  MachineFunction &MF = *MBB.getParent();
  Register FramePtr = TRI->getFrameRegister(MF);
  Register BasePtr = TRI->getBaseRegister();
  WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // FIXME: Don't set FrameSetup flag in catchret case.

  int FI = FuncInfo.EHRegNodeFrameIndex;
  int EHRegSize = MFI.getObjectSize(FI);

  if (RestoreSP) {
    // MOV32rm -EHRegSize(%ebp), %esp
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
                 X86::EBP, true, -EHRegSize)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  Register UsedReg;
  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
  int EndOffset = -EHRegOffset - EHRegSize;
  FuncInfo.EHRegNodeEndOffset = EndOffset;

  if (UsedReg == FramePtr) {
    // ADD $offset, %ebp
    unsigned ADDri = getADDriOpcode(false);
    BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
        .addReg(FramePtr)
        .addImm(EndOffset)
        .setMIFlag(MachineInstr::FrameSetup)
        ->getOperand(3)
        .setIsDead();
    assert(EndOffset >= 0 &&
           "end of registration object above normal EBP position!");
  } else if (UsedReg == BasePtr) {
    // LEA offset(%ebp), %esi
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
                 FramePtr, false, EndOffset)
        .setMIFlag(MachineInstr::FrameSetup);
    // MOV32rm SavedEBPOffset(%esi), %ebp
    assert(X86FI->getHasSEHFramePtrSave());
    int Offset =
        getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
            .getFixed();
    assert(UsedReg == BasePtr);
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
                 UsedReg, true, Offset)
        .setMIFlag(MachineInstr::FrameSetup);
  } else {
    llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
  }
  return MBBI;
}

int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
  return TRI->getSlotSize();
}

Register
X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
  return StackPtr;
}

TargetFrameLowering::DwarfFrameBase
X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
  Register FrameRegister = RI->getFrameRegister(MF);
  if (getInitialCFARegister(MF) == FrameRegister &&
      MF.getInfo<X86MachineFunctionInfo>()->hasCFIAdjustCfa()) {
    DwarfFrameBase FrameBase;
    FrameBase.Kind = DwarfFrameBase::CFA;
    FrameBase.Location.Offset =
        -MF.getFrameInfo().getStackSize() - getInitialCFAOffset(MF);
    return FrameBase;
  }

  return DwarfFrameBase{DwarfFrameBase::Register, {FrameRegister}};
}

namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
  bool IsValid = false;             // True if we care about this Object.
  unsigned ObjectIndex = 0;         // Index of Object into MFI list.
  unsigned ObjectSize = 0;          // Size of Object in bytes.
  Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
  unsigned ObjectNumUses = 0;       // Object static number of uses.
};

// The comparison function we use for stable_sort to order our local
// stack symbols. The current algorithm is to use an estimated
// "density". This takes into consideration the size and number of
// uses each object has in order to roughly minimize code size.
// So, for example, an object of size 16B that is referenced 5 times
// will get higher priority than 4 4B objects referenced 1 time each.
// It's not perfect and we may be able to squeeze a few more bytes out of
// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
// fringe end can have special consideration, given their size is less
// important, etc.), but the algorithmic complexity grows too much to be
// worth the extra gains we get. This gets us pretty close.
// The final order leaves us with objects with highest priority going
// at the end of our list.
struct X86FrameSortingComparator {
  inline bool operator()(const X86FrameSortingObject &A,
                         const X86FrameSortingObject &B) const {
    uint64_t DensityAScaled, DensityBScaled;

    // For consistency in our comparison, all invalid objects are placed
    // at the end. This also allows us to stop walking when we hit the
    // first invalid item after it's all sorted.
    if (!A.IsValid)
      return false;
    if (!B.IsValid)
      return true;

    // The density is calculated by doing:
    //   (double)DensityA = A.ObjectNumUses / A.ObjectSize
    //   (double)DensityB = B.ObjectNumUses / B.ObjectSize
    // Since this approach may cause inconsistencies in
    // the floating point <, >, == comparisons, depending on the floating
    // point model with which the compiler was built, we're going
    // to scale both sides by multiplying with
    // A.ObjectSize * B.ObjectSize. This ends up factoring away
    // the division and, with it, the need for any floating point
    // arithmetic.
    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
                     static_cast<uint64_t>(B.ObjectSize);
    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
                     static_cast<uint64_t>(A.ObjectSize);
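    // Worked example (illustrative numbers only): for A = {8 bytes, 4 uses}
    // and B = {16 bytes, 6 uses}, DensityAScaled = 4 * 16 = 64 and
    // DensityBScaled = 6 * 8 = 48. A's true density (0.5) exceeds B's (0.375),
    // so A compares "greater" and ends up later in the list, i.e. with higher
    // priority.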
    // If the two densities are equal, prioritize highest alignment
    // objects. This allows for similar alignment objects
    // to be packed together (given the same density).
    // There's room for improvement here, also, since we can pack
    // similar alignment (different density) objects next to each
    // other to save padding. This will also require further
    // complexity/iterations, and the overall gain isn't worth it,
    // in general. Something to keep in mind, though.
    if (DensityAScaled == DensityBScaled)
      return A.ObjectAlignment < B.ObjectAlignment;

    return DensityAScaled < DensityBScaled;
  }
};
} // namespace

// Order the symbols in the local stack.
// We want to place the local stack objects in some sort of sensible order.
// The heuristic we use is to try and pack them according to static number
// of uses and size of object in order to minimize code size.
void X86FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Don't waste time if there's nothing to do.
  if (ObjectsToAllocate.empty())
    return;

  // Create an array of all MFI objects. We won't need all of these
  // objects, but we're going to create a full array of them to make
  // it easier to index into when we're counting "uses" down below.
  // We want to be able to easily/cheaply access an object by simply
  // indexing into it, instead of having to search for it every time.
  std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());

  // Walk the objects we care about and mark them as such in our working
  // struct.
  for (auto &Obj : ObjectsToAllocate) {
    SortingObjects[Obj].IsValid = true;
    SortingObjects[Obj].ObjectIndex = Obj;
    SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
    // Set the size.
    int ObjectSize = MFI.getObjectSize(Obj);
    if (ObjectSize == 0)
      // Variable size. Just use 4.
      SortingObjects[Obj].ObjectSize = 4;
    else
      SortingObjects[Obj].ObjectSize = ObjectSize;
  }

  // Count the number of uses for each object.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      for (const MachineOperand &MO : MI.operands()) {
        // Check to see if it's a local stack symbol.
        if (!MO.isFI())
          continue;
        int Index = MO.getIndex();
        // Check to see if it falls within our range, and is tagged
        // to require ordering.
        if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
            SortingObjects[Index].IsValid)
          SortingObjects[Index].ObjectNumUses++;
      }
    }
  }

  // Sort the objects using X86FrameSortingComparator (see its comment for
  // info).
  llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
  // Now modify the original list to represent the final order that we want.
  // The order will depend on whether we're going to access the objects from
  // the stack pointer or the frame pointer. For SP, the list should end with
  // the objects we want at smaller offsets. For FP, it should be flipped.
  int i = 0;
  for (auto &Obj : SortingObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  // Flip it if we're accessing off of the FP.
  if (!TRI->hasStackRealignment(MF) && hasFP(MF))
    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
}

unsigned
X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
  // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
  unsigned Offset = 16;
  // RBP is immediately pushed.
  Offset += SlotSize;
  // All callee-saved registers are then pushed.
  Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
  // Every funclet allocates enough stack space for the largest outgoing call.
  Offset += getWinEHFuncletFrameSize(MF);
  return Offset;
}

void X86FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  // Mark the function as not having WinCFI. We will set it back to true in
  // emitPrologue if it gets called and emits CFI.
  MF.setHasWinCFI(false);

  // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
  // aligned. The format doesn't support misaligned stack adjustments.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
    MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (STI.is64Bit() && MF.hasEHFunclets() &&
      classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
          EHPersonality::MSVC_CXX) {
    adjustFrameForMsvcCxxEh(MF);
  }
}

void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
  // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
  // relative to RSP after the prologue. Find the offset of the last fixed
  // object, so that we can allocate a slot immediately following it. If there
  // were no fixed objects, use offset -SlotSize, which is immediately after
  // the return address. Fixed objects have negative frame indices.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
  int64_t MinFixedObjOffset = -SlotSize;
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
    MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));

  for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
    for (WinEHHandlerType &H : TBME.HandlerArray) {
      int FrameIndex = H.CatchObj.FrameIndex;
      if (FrameIndex != INT_MAX) {
        // Ensure alignment.
        unsigned Align = MFI.getObjectAlign(FrameIndex).value();
        MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
        MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
        MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
      }
    }
  }

  // Ensure alignment.
  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
  int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
  int UnwindHelpFI =
      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
  // Store -2 into UnwindHelp on function entry. We have to scan forwards past
  // other frame setup instructions.
  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  DebugLoc DL = MBB.findDebugLoc(MBBI);
  addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
                    UnwindHelpFI)
      .addImm(-2);
}

void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  if (STI.is32Bit() && MF.hasEHFunclets())
    restoreWinEHStackPointersInParent(MF);
  // We have emitted the prologue and epilogue; the stack pointer saving
  // instruction is no longer needed.
  if (MachineInstr *MI = X86FI->getStackPtrSaveMI()) {
    MI->eraseFromParent();
    X86FI->setStackPtrSaveMI(nullptr);
  }
}

void X86FrameLowering::restoreWinEHStackPointersInParent(
    MachineFunction &MF) const {
  // 32-bit functions have to restore stack pointers when control is
  // transferred back to the parent function. These blocks are identified as
  // eh pads that are not funclet entries.
  bool IsSEH = isAsynchronousEHPersonality(
      classifyEHPersonality(MF.getFunction().getPersonalityFn()));
  for (MachineBasicBlock &MBB : MF) {
    bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
    if (NeedsRestore)
      restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
                                  /*RestoreSP=*/IsSEH);
  }
}