GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
1
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file contains the AArch64 implementation of the TargetInstrInfo class.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "AArch64InstrInfo.h"
14
#include "AArch64ExpandImm.h"
15
#include "AArch64FrameLowering.h"
16
#include "AArch64MachineFunctionInfo.h"
17
#include "AArch64PointerAuth.h"
18
#include "AArch64Subtarget.h"
19
#include "MCTargetDesc/AArch64AddressingModes.h"
20
#include "MCTargetDesc/AArch64MCTargetDesc.h"
21
#include "Utils/AArch64BaseInfo.h"
22
#include "llvm/ADT/ArrayRef.h"
23
#include "llvm/ADT/STLExtras.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/CodeGen/LivePhysRegs.h"
26
#include "llvm/CodeGen/MachineBasicBlock.h"
27
#include "llvm/CodeGen/MachineCombinerPattern.h"
28
#include "llvm/CodeGen/MachineFrameInfo.h"
29
#include "llvm/CodeGen/MachineFunction.h"
30
#include "llvm/CodeGen/MachineInstr.h"
31
#include "llvm/CodeGen/MachineInstrBuilder.h"
32
#include "llvm/CodeGen/MachineMemOperand.h"
33
#include "llvm/CodeGen/MachineModuleInfo.h"
34
#include "llvm/CodeGen/MachineOperand.h"
35
#include "llvm/CodeGen/MachineRegisterInfo.h"
36
#include "llvm/CodeGen/RegisterScavenging.h"
37
#include "llvm/CodeGen/StackMaps.h"
38
#include "llvm/CodeGen/TargetRegisterInfo.h"
39
#include "llvm/CodeGen/TargetSubtargetInfo.h"
40
#include "llvm/IR/DebugInfoMetadata.h"
41
#include "llvm/IR/DebugLoc.h"
42
#include "llvm/IR/GlobalValue.h"
43
#include "llvm/IR/Module.h"
44
#include "llvm/MC/MCAsmInfo.h"
45
#include "llvm/MC/MCInst.h"
46
#include "llvm/MC/MCInstBuilder.h"
47
#include "llvm/MC/MCInstrDesc.h"
48
#include "llvm/Support/Casting.h"
49
#include "llvm/Support/CodeGen.h"
50
#include "llvm/Support/CommandLine.h"
51
#include "llvm/Support/ErrorHandling.h"
52
#include "llvm/Support/LEB128.h"
53
#include "llvm/Support/MathExtras.h"
54
#include "llvm/Target/TargetMachine.h"
55
#include "llvm/Target/TargetOptions.h"
56
#include <cassert>
57
#include <cstdint>
58
#include <iterator>
59
#include <utility>
60
61
using namespace llvm;
62
63
#define GET_INSTRINFO_CTOR_DTOR
64
#include "AArch64GenInstrInfo.inc"
65
66
static cl::opt<unsigned> TBZDisplacementBits(
67
"aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
68
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69
70
static cl::opt<unsigned> CBZDisplacementBits(
71
"aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
72
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73
74
static cl::opt<unsigned>
75
BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
76
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77
78
static cl::opt<unsigned>
79
BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
80
cl::desc("Restrict range of B instructions (DEBUG)"));
81
82
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
83
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
84
AArch64::CATCHRET),
85
RI(STI.getTargetTriple()), Subtarget(STI) {}
86
87
/// GetInstSize - Return the number of bytes of code the specified
88
/// instruction may occupy. This returns the maximum number of bytes.
89
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
90
const MachineBasicBlock &MBB = *MI.getParent();
91
const MachineFunction *MF = MBB.getParent();
92
const Function &F = MF->getFunction();
93
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
94
95
{
96
auto Op = MI.getOpcode();
97
if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
98
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
99
}
100
101
// Meta-instructions emit no code.
102
if (MI.isMetaInstruction())
103
return 0;
104
105
// FIXME: We currently only handle pseudoinstructions that don't get expanded
106
// before the assembly printer.
107
unsigned NumBytes = 0;
108
const MCInstrDesc &Desc = MI.getDesc();
109
110
// Size should preferably be set in
111
// llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
112
// Specific cases handle instructions of variable sizes
113
switch (Desc.getOpcode()) {
114
default:
115
if (Desc.getSize())
116
return Desc.getSize();
117
118
// Anything not explicitly designated otherwise (i.e. pseudo-instructions
119
// with fixed constant size but not specified in .td file) is a normal
120
// 4-byte insn.
121
NumBytes = 4;
122
break;
123
case TargetOpcode::STACKMAP:
124
// The upper bound for a stackmap intrinsic is the full length of its shadow
125
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
126
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
127
break;
128
case TargetOpcode::PATCHPOINT:
129
// The size of the patchpoint intrinsic is the number of bytes requested
130
NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
131
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
132
break;
133
case TargetOpcode::STATEPOINT:
134
NumBytes = StatepointOpers(&MI).getNumPatchBytes();
135
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
136
// No patch bytes means a normal call inst is emitted
137
if (NumBytes == 0)
138
NumBytes = 4;
139
break;
140
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
141
// If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
142
// instructions are expanded to the specified number of NOPs. Otherwise,
143
// they are expanded to 36-byte XRay sleds.
144
NumBytes =
145
F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
146
break;
147
case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
148
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
149
// An XRay sled can be 4 bytes of alignment plus a 32-byte block.
150
NumBytes = 36;
151
break;
152
case TargetOpcode::PATCHABLE_EVENT_CALL:
153
// EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
154
NumBytes = 24;
155
break;
156
157
case AArch64::SPACE:
158
NumBytes = MI.getOperand(1).getImm();
159
break;
160
case TargetOpcode::BUNDLE:
161
NumBytes = getInstBundleLength(MI);
162
break;
163
}
164
165
return NumBytes;
166
}
167
168
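// Sum the sizes of all instructions inside the bundle headed by MI; the
// BUNDLE marker itself contributes no bytes.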
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
169
unsigned Size = 0;
170
MachineBasicBlock::const_instr_iterator I = MI.getIterator();
171
MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
172
while (++I != E && I->isInsideBundle()) {
173
assert(!I->isBundle() && "No nested bundle!");
174
Size += getInstSizeInBytes(*I);
175
}
176
return Size;
177
}
178
179
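// Decode a conditional branch terminator into its target block and the Cond
// encoding used by analyzeBranch: Bcc pushes its condition code, while
// CB(N)Z/TB(N)Z push a -1 marker, the opcode, the register and, for TB(N)Z,
// the bit number.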
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
180
SmallVectorImpl<MachineOperand> &Cond) {
181
// Block ends with fall-through condbranch.
182
switch (LastInst->getOpcode()) {
183
default:
184
llvm_unreachable("Unknown branch instruction?");
185
case AArch64::Bcc:
186
Target = LastInst->getOperand(1).getMBB();
187
Cond.push_back(LastInst->getOperand(0));
188
break;
189
case AArch64::CBZW:
190
case AArch64::CBZX:
191
case AArch64::CBNZW:
192
case AArch64::CBNZX:
193
Target = LastInst->getOperand(1).getMBB();
194
Cond.push_back(MachineOperand::CreateImm(-1));
195
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
196
Cond.push_back(LastInst->getOperand(0));
197
break;
198
case AArch64::TBZW:
199
case AArch64::TBZX:
200
case AArch64::TBNZW:
201
case AArch64::TBNZX:
202
Target = LastInst->getOperand(2).getMBB();
203
Cond.push_back(MachineOperand::CreateImm(-1));
204
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
205
Cond.push_back(LastInst->getOperand(0));
206
Cond.push_back(LastInst->getOperand(1));
207
}
208
}
209
210
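// Return the number of bits available for the signed displacement of the
// given branch opcode (tunable via the -aarch64-*-offset-bits options above).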
static unsigned getBranchDisplacementBits(unsigned Opc) {
211
switch (Opc) {
212
default:
213
llvm_unreachable("unexpected opcode!");
214
case AArch64::B:
215
return BDisplacementBits;
216
case AArch64::TBNZW:
217
case AArch64::TBZW:
218
case AArch64::TBNZX:
219
case AArch64::TBZX:
220
return TBZDisplacementBits;
221
case AArch64::CBNZW:
222
case AArch64::CBZW:
223
case AArch64::CBNZX:
224
case AArch64::CBZX:
225
return CBZDisplacementBits;
226
case AArch64::Bcc:
227
return BCCDisplacementBits;
228
}
229
}
230
231
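// Branch displacements are encoded in units of 4-byte instructions, so scale
// the byte offset before checking it against the opcode's displacement field.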
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
232
int64_t BrOffset) const {
233
unsigned Bits = getBranchDisplacementBits(BranchOp);
234
assert(Bits >= 3 && "max branch displacement must be enough to jump"
235
"over conditional branch expansion");
236
return isIntN(Bits, BrOffset / 4);
237
}
238
239
MachineBasicBlock *
240
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
241
switch (MI.getOpcode()) {
242
default:
243
llvm_unreachable("unexpected opcode!");
244
case AArch64::B:
245
return MI.getOperand(0).getMBB();
246
case AArch64::TBZW:
247
case AArch64::TBNZW:
248
case AArch64::TBZX:
249
case AArch64::TBNZX:
250
return MI.getOperand(2).getMBB();
251
case AArch64::CBZW:
252
case AArch64::CBNZW:
253
case AArch64::CBZX:
254
case AArch64::CBNZX:
255
case AArch64::Bcc:
256
return MI.getOperand(1).getMBB();
257
}
258
}
259
260
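// Expand an unconditional branch whose target may be out of range of a single
// B. Prefer a plain B and let the linker insert a range-extension thunk
// (which may clobber X16); otherwise build ADRP+ADD+BR through a scavenged
// register, or spill X16 around the thunk and reload it in RestoreBB.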
void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
261
MachineBasicBlock &NewDestBB,
262
MachineBasicBlock &RestoreBB,
263
const DebugLoc &DL,
264
int64_t BrOffset,
265
RegScavenger *RS) const {
266
assert(RS && "RegScavenger required for long branching");
267
assert(MBB.empty() &&
268
"new block should be inserted for expanding unconditional branch");
269
assert(MBB.pred_size() == 1);
270
assert(RestoreBB.empty() &&
271
"restore block should be inserted for restoring clobbered registers");
272
273
auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
274
// Offsets outside of the signed 33-bit range are not supported for ADRP +
275
// ADD.
276
if (!isInt<33>(BrOffset))
277
report_fatal_error(
278
"Branch offsets outside of the signed 33-bit range not supported");
279
280
BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
281
.addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
282
BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
283
.addReg(Reg)
284
.addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
285
.addImm(0);
286
BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
287
};
288
289
RS->enterBasicBlockEnd(MBB);
290
// If X16 is unused, we can rely on the linker to insert a range extension
291
// thunk if NewDestBB is out of range of a single B instruction.
292
constexpr Register Reg = AArch64::X16;
293
if (!RS->isRegUsed(Reg)) {
294
insertUnconditionalBranch(MBB, &NewDestBB, DL);
295
RS->setRegUsed(Reg);
296
return;
297
}
298
299
// If there's a free register and it's worth inflating the code size,
300
// manually insert the indirect branch.
301
Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
302
if (Scavenged != AArch64::NoRegister &&
303
MBB.getSectionID() == MBBSectionID::ColdSectionID) {
304
buildIndirectBranch(Scavenged, NewDestBB);
305
RS->setRegUsed(Scavenged);
306
return;
307
}
308
309
// Note: Spilling X16 briefly moves the stack pointer, making it incompatible
310
// with red zones.
311
AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
312
if (!AFI || AFI->hasRedZone().value_or(true))
313
report_fatal_error(
314
"Unable to insert indirect branch inside function that has red zone");
315
316
// Otherwise, spill X16 and defer range extension to the linker.
317
BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
318
.addReg(AArch64::SP, RegState::Define)
319
.addReg(Reg)
320
.addReg(AArch64::SP)
321
.addImm(-16);
322
323
BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
324
325
BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
326
.addReg(AArch64::SP, RegState::Define)
327
.addReg(Reg, RegState::Define)
328
.addReg(AArch64::SP)
329
.addImm(16);
330
}
331
332
// Branch analysis.
333
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
334
MachineBasicBlock *&TBB,
335
MachineBasicBlock *&FBB,
336
SmallVectorImpl<MachineOperand> &Cond,
337
bool AllowModify) const {
338
// If the block has no terminators, it just falls into the block after it.
339
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340
if (I == MBB.end())
341
return false;
342
343
// Skip over SpeculationBarrierEndBB terminators
344
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
345
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
346
--I;
347
}
348
349
if (!isUnpredicatedTerminator(*I))
350
return false;
351
352
// Get the last instruction in the block.
353
MachineInstr *LastInst = &*I;
354
355
// If there is only one terminator instruction, process it.
356
unsigned LastOpc = LastInst->getOpcode();
357
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
358
if (isUncondBranchOpcode(LastOpc)) {
359
TBB = LastInst->getOperand(0).getMBB();
360
return false;
361
}
362
if (isCondBranchOpcode(LastOpc)) {
363
// Block ends with fall-through condbranch.
364
parseCondBranch(LastInst, TBB, Cond);
365
return false;
366
}
367
return true; // Can't handle indirect branch.
368
}
369
370
// Get the instruction before it if it is a terminator.
371
MachineInstr *SecondLastInst = &*I;
372
unsigned SecondLastOpc = SecondLastInst->getOpcode();
373
374
// If AllowModify is true and the block ends with two or more unconditional
375
// branches, delete all but the first unconditional branch.
376
if (AllowModify && isUncondBranchOpcode(LastOpc)) {
377
while (isUncondBranchOpcode(SecondLastOpc)) {
378
LastInst->eraseFromParent();
379
LastInst = SecondLastInst;
380
LastOpc = LastInst->getOpcode();
381
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
382
// Return now the only terminator is an unconditional branch.
383
TBB = LastInst->getOperand(0).getMBB();
384
return false;
385
}
386
SecondLastInst = &*I;
387
SecondLastOpc = SecondLastInst->getOpcode();
388
}
389
}
390
391
// If we're allowed to modify and the block ends in an unconditional branch
392
// which could simply fall through, remove the branch. (Note: This case only
393
// matters when we can't understand the whole sequence, otherwise it's also
394
// handled by BranchFolding.cpp.)
395
if (AllowModify && isUncondBranchOpcode(LastOpc) &&
396
MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
397
LastInst->eraseFromParent();
398
LastInst = SecondLastInst;
399
LastOpc = LastInst->getOpcode();
400
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
401
assert(!isUncondBranchOpcode(LastOpc) &&
402
"unreachable unconditional branches removed above");
403
404
if (isCondBranchOpcode(LastOpc)) {
405
// Block ends with fall-through condbranch.
406
parseCondBranch(LastInst, TBB, Cond);
407
return false;
408
}
409
return true; // Can't handle indirect branch.
410
}
411
SecondLastInst = &*I;
412
SecondLastOpc = SecondLastInst->getOpcode();
413
}
414
415
// If there are three terminators, we don't know what sort of block this is.
416
if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
417
return true;
418
419
// If the block ends with a B and a Bcc, handle it.
420
if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
421
parseCondBranch(SecondLastInst, TBB, Cond);
422
FBB = LastInst->getOperand(0).getMBB();
423
return false;
424
}
425
426
// If the block ends with two unconditional branches, handle it. The second
427
// one is not executed, so remove it.
428
if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
429
TBB = SecondLastInst->getOperand(0).getMBB();
430
I = LastInst;
431
if (AllowModify)
432
I->eraseFromParent();
433
return false;
434
}
435
436
// ...likewise if it ends with an indirect branch followed by an unconditional
437
// branch.
438
if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
439
I = LastInst;
440
if (AllowModify)
441
I->eraseFromParent();
442
return true;
443
}
444
445
// Otherwise, can't handle this.
446
return true;
447
}
448
449
bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
450
MachineBranchPredicate &MBP,
451
bool AllowModify) const {
452
// For the moment, handle only a block which ends with a cb(n)zx followed by
453
// a fallthrough. Why this? Because it is a common form.
454
// TODO: Should we handle b.cc?
455
456
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
457
if (I == MBB.end())
458
return true;
459
460
// Skip over SpeculationBarrierEndBB terminators
461
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
462
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
463
--I;
464
}
465
466
if (!isUnpredicatedTerminator(*I))
467
return true;
468
469
// Get the last instruction in the block.
470
MachineInstr *LastInst = &*I;
471
unsigned LastOpc = LastInst->getOpcode();
472
if (!isCondBranchOpcode(LastOpc))
473
return true;
474
475
switch (LastOpc) {
476
default:
477
return true;
478
case AArch64::CBZW:
479
case AArch64::CBZX:
480
case AArch64::CBNZW:
481
case AArch64::CBNZX:
482
break;
483
};
484
485
MBP.TrueDest = LastInst->getOperand(1).getMBB();
486
assert(MBP.TrueDest && "expected!");
487
MBP.FalseDest = MBB.getNextNode();
488
489
MBP.ConditionDef = nullptr;
490
MBP.SingleUseCondition = false;
491
492
MBP.LHS = LastInst->getOperand(0);
493
MBP.RHS = MachineOperand::CreateImm(0);
494
MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
495
: MachineBranchPredicate::PRED_EQ;
496
return false;
497
}
498
499
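// Invert a condition produced by analyzeBranch: flip the Bcc condition code,
// or swap a folded compare-and-branch/test-and-branch opcode for its negated
// form.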
bool AArch64InstrInfo::reverseBranchCondition(
500
SmallVectorImpl<MachineOperand> &Cond) const {
501
if (Cond[0].getImm() != -1) {
502
// Regular Bcc
503
AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
504
Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
505
} else {
506
// Folded compare-and-branch
507
switch (Cond[1].getImm()) {
508
default:
509
llvm_unreachable("Unknown conditional branch!");
510
case AArch64::CBZW:
511
Cond[1].setImm(AArch64::CBNZW);
512
break;
513
case AArch64::CBNZW:
514
Cond[1].setImm(AArch64::CBZW);
515
break;
516
case AArch64::CBZX:
517
Cond[1].setImm(AArch64::CBNZX);
518
break;
519
case AArch64::CBNZX:
520
Cond[1].setImm(AArch64::CBZX);
521
break;
522
case AArch64::TBZW:
523
Cond[1].setImm(AArch64::TBNZW);
524
break;
525
case AArch64::TBNZW:
526
Cond[1].setImm(AArch64::TBZW);
527
break;
528
case AArch64::TBZX:
529
Cond[1].setImm(AArch64::TBNZX);
530
break;
531
case AArch64::TBNZX:
532
Cond[1].setImm(AArch64::TBZX);
533
break;
534
}
535
}
536
537
return false;
538
}
539
540
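// Erase the trailing unconditional and/or conditional branch from MBB,
// returning how many terminators were removed and, optionally, how many
// bytes that freed.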
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
541
int *BytesRemoved) const {
542
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
543
if (I == MBB.end())
544
return 0;
545
546
if (!isUncondBranchOpcode(I->getOpcode()) &&
547
!isCondBranchOpcode(I->getOpcode()))
548
return 0;
549
550
// Remove the branch.
551
I->eraseFromParent();
552
553
I = MBB.end();
554
555
if (I == MBB.begin()) {
556
if (BytesRemoved)
557
*BytesRemoved = 4;
558
return 1;
559
}
560
--I;
561
if (!isCondBranchOpcode(I->getOpcode())) {
562
if (BytesRemoved)
563
*BytesRemoved = 4;
564
return 1;
565
}
566
567
// Remove the branch.
568
I->eraseFromParent();
569
if (BytesRemoved)
570
*BytesRemoved = 8;
571
572
return 2;
573
}
574
575
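// Materialize the conditional branch described by a Cond array from
// analyzeBranch: either a Bcc or the original compare-/test-and-branch
// opcode.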
void AArch64InstrInfo::instantiateCondBranch(
576
MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
577
ArrayRef<MachineOperand> Cond) const {
578
if (Cond[0].getImm() != -1) {
579
// Regular Bcc
580
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
581
} else {
582
// Folded compare-and-branch
583
// Note that we use add() instead of addReg() so the operand's flags are kept.
584
const MachineInstrBuilder MIB =
585
BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
586
if (Cond.size() > 3)
587
MIB.addImm(Cond[3].getImm());
588
MIB.addMBB(TBB);
589
}
590
}
591
592
unsigned AArch64InstrInfo::insertBranch(
593
MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
594
ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
595
// Shouldn't be a fall through.
596
assert(TBB && "insertBranch must not be told to insert a fallthrough");
597
598
if (!FBB) {
599
if (Cond.empty()) // Unconditional branch?
600
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
601
else
602
instantiateCondBranch(MBB, DL, TBB, Cond);
603
604
if (BytesAdded)
605
*BytesAdded = 4;
606
607
return 1;
608
}
609
610
// Two-way conditional branch.
611
instantiateCondBranch(MBB, DL, TBB, Cond);
612
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
613
614
if (BytesAdded)
615
*BytesAdded = 8;
616
617
return 2;
618
}
619
620
// Find the original register that VReg is copied from.
621
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
622
while (Register::isVirtualRegister(VReg)) {
623
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
624
if (!DefMI->isFullCopy())
625
return VReg;
626
VReg = DefMI->getOperand(1).getReg();
627
}
628
return VReg;
629
}
630
631
// Determine if VReg is defined by an instruction that can be folded into a
632
// csel instruction. If so, return the folded opcode, and the replacement
633
// register.
634
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
635
unsigned *NewVReg = nullptr) {
636
VReg = removeCopies(MRI, VReg);
637
if (!Register::isVirtualRegister(VReg))
638
return 0;
639
640
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
641
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
642
unsigned Opc = 0;
643
unsigned SrcOpNum = 0;
644
switch (DefMI->getOpcode()) {
645
case AArch64::ADDSXri:
646
case AArch64::ADDSWri:
647
// if NZCV is used, do not fold.
648
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
649
true) == -1)
650
return 0;
651
// fall-through to ADDXri and ADDWri.
652
[[fallthrough]];
653
case AArch64::ADDXri:
654
case AArch64::ADDWri:
655
// add x, 1 -> csinc.
656
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
657
DefMI->getOperand(3).getImm() != 0)
658
return 0;
659
SrcOpNum = 1;
660
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
661
break;
662
663
case AArch64::ORNXrr:
664
case AArch64::ORNWrr: {
665
// not x -> csinv, represented as orn dst, xzr, src.
666
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
667
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
668
return 0;
669
SrcOpNum = 2;
670
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
671
break;
672
}
673
674
case AArch64::SUBSXrr:
675
case AArch64::SUBSWrr:
676
// if NZCV is used, do not fold.
677
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
678
true) == -1)
679
return 0;
680
// fall-through to SUBXrr and SUBWrr.
681
[[fallthrough]];
682
case AArch64::SUBXrr:
683
case AArch64::SUBWrr: {
684
// neg x -> csneg, represented as sub dst, xzr, src.
685
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
686
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
687
return 0;
688
SrcOpNum = 2;
689
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
690
break;
691
}
692
default:
693
return 0;
694
}
695
assert(Opc && SrcOpNum && "Missing parameters");
696
697
if (NewVReg)
698
*NewVReg = DefMI->getOperand(SrcOpNum).getReg();
699
return Opc;
700
}
701
702
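// Report whether a select on this condition and these register classes can
// be lowered to csel/fcsel, along with rough latency estimates for the
// condition and the two operands.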
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
703
ArrayRef<MachineOperand> Cond,
704
Register DstReg, Register TrueReg,
705
Register FalseReg, int &CondCycles,
706
int &TrueCycles,
707
int &FalseCycles) const {
708
// Check register classes.
709
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
710
const TargetRegisterClass *RC =
711
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
712
if (!RC)
713
return false;
714
715
// Also need to check the dest regclass, in case we're trying to optimize
716
// something like:
717
// %1(gpr) = PHI %2(fpr), bb1, %3(fpr), bb2
718
if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
719
return false;
720
721
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
722
unsigned ExtraCondLat = Cond.size() != 1;
723
724
// GPRs are handled by csel.
725
// FIXME: Fold in x+1, -x, and ~x when applicable.
726
if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
727
AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
728
// Single-cycle csel, csinc, csinv, and csneg.
729
CondCycles = 1 + ExtraCondLat;
730
TrueCycles = FalseCycles = 1;
731
if (canFoldIntoCSel(MRI, TrueReg))
732
TrueCycles = 0;
733
else if (canFoldIntoCSel(MRI, FalseReg))
734
FalseCycles = 0;
735
return true;
736
}
737
738
// Scalar floating point is handled by fcsel.
739
// FIXME: Form fabs, fmin, and fmax when applicable.
740
if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
741
AArch64::FPR32RegClass.hasSubClassEq(RC)) {
742
CondCycles = 5 + ExtraCondLat;
743
TrueCycles = FalseCycles = 2;
744
return true;
745
}
746
747
// Can't do vectors.
748
return false;
749
}
750
751
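// Emit a compare (when the condition came from cbz/cbnz or tbz/tbnz) followed
// by a csel/fcsel, or a folded csinc/csinv/csneg, selecting TrueReg or
// FalseReg into DstReg.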
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
752
MachineBasicBlock::iterator I,
753
const DebugLoc &DL, Register DstReg,
754
ArrayRef<MachineOperand> Cond,
755
Register TrueReg, Register FalseReg) const {
756
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
757
758
// Parse the condition code, see parseCondBranch() above.
759
AArch64CC::CondCode CC;
760
switch (Cond.size()) {
761
default:
762
llvm_unreachable("Unknown condition opcode in Cond");
763
case 1: // b.cc
764
CC = AArch64CC::CondCode(Cond[0].getImm());
765
break;
766
case 3: { // cbz/cbnz
767
// We must insert a compare against 0.
768
bool Is64Bit;
769
switch (Cond[1].getImm()) {
770
default:
771
llvm_unreachable("Unknown branch opcode in Cond");
772
case AArch64::CBZW:
773
Is64Bit = false;
774
CC = AArch64CC::EQ;
775
break;
776
case AArch64::CBZX:
777
Is64Bit = true;
778
CC = AArch64CC::EQ;
779
break;
780
case AArch64::CBNZW:
781
Is64Bit = false;
782
CC = AArch64CC::NE;
783
break;
784
case AArch64::CBNZX:
785
Is64Bit = true;
786
CC = AArch64CC::NE;
787
break;
788
}
789
Register SrcReg = Cond[2].getReg();
790
if (Is64Bit) {
791
// cmp reg, #0 is actually subs xzr, reg, #0.
792
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
793
BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
794
.addReg(SrcReg)
795
.addImm(0)
796
.addImm(0);
797
} else {
798
MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
799
BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
800
.addReg(SrcReg)
801
.addImm(0)
802
.addImm(0);
803
}
804
break;
805
}
806
case 4: { // tbz/tbnz
807
// We must insert a tst instruction.
808
switch (Cond[1].getImm()) {
809
default:
810
llvm_unreachable("Unknown branch opcode in Cond");
811
case AArch64::TBZW:
812
case AArch64::TBZX:
813
CC = AArch64CC::EQ;
814
break;
815
case AArch64::TBNZW:
816
case AArch64::TBNZX:
817
CC = AArch64CC::NE;
818
break;
819
}
820
// tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
821
if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
822
BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
823
.addReg(Cond[2].getReg())
824
.addImm(
825
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
826
else
827
BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
828
.addReg(Cond[2].getReg())
829
.addImm(
830
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
831
break;
832
}
833
}
834
835
unsigned Opc = 0;
836
const TargetRegisterClass *RC = nullptr;
837
bool TryFold = false;
838
if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
839
RC = &AArch64::GPR64RegClass;
840
Opc = AArch64::CSELXr;
841
TryFold = true;
842
} else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
843
RC = &AArch64::GPR32RegClass;
844
Opc = AArch64::CSELWr;
845
TryFold = true;
846
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
847
RC = &AArch64::FPR64RegClass;
848
Opc = AArch64::FCSELDrrr;
849
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
850
RC = &AArch64::FPR32RegClass;
851
Opc = AArch64::FCSELSrrr;
852
}
853
assert(RC && "Unsupported regclass");
854
855
// Try folding simple instructions into the csel.
856
if (TryFold) {
857
unsigned NewVReg = 0;
858
unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
859
if (FoldedOpc) {
860
// The folded opcodes csinc, csinv and csneg apply the operation to
861
// FalseReg, so we need to invert the condition.
862
CC = AArch64CC::getInvertedCondCode(CC);
863
TrueReg = FalseReg;
864
} else
865
FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
866
867
// Fold the operation. Leave any dead instructions for DCE to clean up.
868
if (FoldedOpc) {
869
FalseReg = NewVReg;
870
Opc = FoldedOpc;
871
// This extends the live range of NewVReg.
872
MRI.clearKillFlags(NewVReg);
873
}
874
}
875
876
// Pull all virtual registers into the appropriate class.
877
MRI.constrainRegClass(TrueReg, RC);
878
MRI.constrainRegClass(FalseReg, RC);
879
880
// Insert the csel.
881
BuildMI(MBB, I, DL, get(Opc), DstReg)
882
.addReg(TrueReg)
883
.addReg(FalseReg)
884
.addImm(CC);
885
}
886
887
// Return true if Imm can be loaded into a register by a "cheap" sequence of
888
// instructions. For now, "cheap" means at most two instructions.
889
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
890
if (BitSize == 32)
891
return true;
892
893
assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
894
uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
895
SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
896
AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
897
898
return Is.size() <= 2;
899
}
900
901
// FIXME: this implementation should be micro-architecture dependent, so a
902
// micro-architecture target hook should be introduced here in future.
903
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904
if (Subtarget.hasExynosCheapAsMoveHandling()) {
905
if (isExynosCheapAsMove(MI))
906
return true;
907
return MI.isAsCheapAsAMove();
908
}
909
910
switch (MI.getOpcode()) {
911
default:
912
return MI.isAsCheapAsAMove();
913
914
case AArch64::ADDWrs:
915
case AArch64::ADDXrs:
916
case AArch64::SUBWrs:
917
case AArch64::SUBXrs:
918
return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
919
920
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
921
// ORRXri, it is as cheap as MOV.
922
// Likewise if it can be expanded to MOVZ/MOVN/MOVK.
923
case AArch64::MOVi32imm:
924
return isCheapImmediate(MI, 32);
925
case AArch64::MOVi64imm:
926
return isCheapImmediate(MI, 64);
927
}
928
}
929
930
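// Return true when MI's shifted or extended operand form is cheap on Falkor,
// i.e. the shift amount or extension kind carries no extra latency.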
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
931
switch (MI.getOpcode()) {
932
default:
933
return false;
934
935
case AArch64::ADDWrs:
936
case AArch64::ADDXrs:
937
case AArch64::ADDSWrs:
938
case AArch64::ADDSXrs: {
939
unsigned Imm = MI.getOperand(3).getImm();
940
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
941
if (ShiftVal == 0)
942
return true;
943
return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
944
}
945
946
case AArch64::ADDWrx:
947
case AArch64::ADDXrx:
948
case AArch64::ADDXrx64:
949
case AArch64::ADDSWrx:
950
case AArch64::ADDSXrx:
951
case AArch64::ADDSXrx64: {
952
unsigned Imm = MI.getOperand(3).getImm();
953
switch (AArch64_AM::getArithExtendType(Imm)) {
954
default:
955
return false;
956
case AArch64_AM::UXTB:
957
case AArch64_AM::UXTH:
958
case AArch64_AM::UXTW:
959
case AArch64_AM::UXTX:
960
return AArch64_AM::getArithShiftValue(Imm) <= 4;
961
}
962
}
963
964
case AArch64::SUBWrs:
965
case AArch64::SUBSWrs: {
966
unsigned Imm = MI.getOperand(3).getImm();
967
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
968
return ShiftVal == 0 ||
969
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
970
}
971
972
case AArch64::SUBXrs:
973
case AArch64::SUBSXrs: {
974
unsigned Imm = MI.getOperand(3).getImm();
975
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
976
return ShiftVal == 0 ||
977
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
978
}
979
980
case AArch64::SUBWrx:
981
case AArch64::SUBXrx:
982
case AArch64::SUBXrx64:
983
case AArch64::SUBSWrx:
984
case AArch64::SUBSXrx:
985
case AArch64::SUBSXrx64: {
986
unsigned Imm = MI.getOperand(3).getImm();
987
switch (AArch64_AM::getArithExtendType(Imm)) {
988
default:
989
return false;
990
case AArch64_AM::UXTB:
991
case AArch64_AM::UXTH:
992
case AArch64_AM::UXTW:
993
case AArch64_AM::UXTX:
994
return AArch64_AM::getArithShiftValue(Imm) == 0;
995
}
996
}
997
998
case AArch64::LDRBBroW:
999
case AArch64::LDRBBroX:
1000
case AArch64::LDRBroW:
1001
case AArch64::LDRBroX:
1002
case AArch64::LDRDroW:
1003
case AArch64::LDRDroX:
1004
case AArch64::LDRHHroW:
1005
case AArch64::LDRHHroX:
1006
case AArch64::LDRHroW:
1007
case AArch64::LDRHroX:
1008
case AArch64::LDRQroW:
1009
case AArch64::LDRQroX:
1010
case AArch64::LDRSBWroW:
1011
case AArch64::LDRSBWroX:
1012
case AArch64::LDRSBXroW:
1013
case AArch64::LDRSBXroX:
1014
case AArch64::LDRSHWroW:
1015
case AArch64::LDRSHWroX:
1016
case AArch64::LDRSHXroW:
1017
case AArch64::LDRSHXroX:
1018
case AArch64::LDRSWroW:
1019
case AArch64::LDRSWroX:
1020
case AArch64::LDRSroW:
1021
case AArch64::LDRSroX:
1022
case AArch64::LDRWroW:
1023
case AArch64::LDRWroX:
1024
case AArch64::LDRXroW:
1025
case AArch64::LDRXroX:
1026
case AArch64::PRFMroW:
1027
case AArch64::PRFMroX:
1028
case AArch64::STRBBroW:
1029
case AArch64::STRBBroX:
1030
case AArch64::STRBroW:
1031
case AArch64::STRBroX:
1032
case AArch64::STRDroW:
1033
case AArch64::STRDroX:
1034
case AArch64::STRHHroW:
1035
case AArch64::STRHHroX:
1036
case AArch64::STRHroW:
1037
case AArch64::STRHroX:
1038
case AArch64::STRQroW:
1039
case AArch64::STRQroX:
1040
case AArch64::STRSroW:
1041
case AArch64::STRSroX:
1042
case AArch64::STRWroW:
1043
case AArch64::STRWroX:
1044
case AArch64::STRXroW:
1045
case AArch64::STRXroX: {
1046
unsigned IsSigned = MI.getOperand(3).getImm();
1047
return !IsSigned;
1048
}
1049
}
1050
}
1051
1052
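// Return true for the pseudo instructions that represent Windows SEH unwind
// opcodes.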
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1053
unsigned Opc = MI.getOpcode();
1054
switch (Opc) {
1055
default:
1056
return false;
1057
case AArch64::SEH_StackAlloc:
1058
case AArch64::SEH_SaveFPLR:
1059
case AArch64::SEH_SaveFPLR_X:
1060
case AArch64::SEH_SaveReg:
1061
case AArch64::SEH_SaveReg_X:
1062
case AArch64::SEH_SaveRegP:
1063
case AArch64::SEH_SaveRegP_X:
1064
case AArch64::SEH_SaveFReg:
1065
case AArch64::SEH_SaveFReg_X:
1066
case AArch64::SEH_SaveFRegP:
1067
case AArch64::SEH_SaveFRegP_X:
1068
case AArch64::SEH_SetFP:
1069
case AArch64::SEH_AddFP:
1070
case AArch64::SEH_Nop:
1071
case AArch64::SEH_PrologEnd:
1072
case AArch64::SEH_EpilogStart:
1073
case AArch64::SEH_EpilogEnd:
1074
case AArch64::SEH_PACSignLR:
1075
case AArch64::SEH_SaveAnyRegQP:
1076
case AArch64::SEH_SaveAnyRegQPX:
1077
return true;
1078
}
1079
}
1080
1081
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1082
Register &SrcReg, Register &DstReg,
1083
unsigned &SubIdx) const {
1084
switch (MI.getOpcode()) {
1085
default:
1086
return false;
1087
case AArch64::SBFMXri: // aka sxtw
1088
case AArch64::UBFMXri: // aka uxtw
1089
// Check for the 32 -> 64 bit extension case, these instructions can do
1090
// much more.
1091
if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1092
return false;
1093
// This is a signed or unsigned 32 -> 64 bit extension.
1094
SrcReg = MI.getOperand(1).getReg();
1095
DstReg = MI.getOperand(0).getReg();
1096
SubIdx = AArch64::sub_32;
1097
return true;
1098
}
1099
}
1100
1101
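// Conservatively prove that two memory accesses cannot overlap by comparing
// their base operands, offsets and access widths.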
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1102
const MachineInstr &MIa, const MachineInstr &MIb) const {
1103
const TargetRegisterInfo *TRI = &getRegisterInfo();
1104
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1105
int64_t OffsetA = 0, OffsetB = 0;
1106
TypeSize WidthA(0, false), WidthB(0, false);
1107
bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1108
1109
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1110
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1111
1112
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1113
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1114
return false;
1115
1116
// Retrieve the base, offset from the base and width. Width
1117
// is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1118
// bases are identical, and the offset of the lower memory access +
1119
// the width doesn't overlap the offset of a higher memory access,
1120
// then the memory accesses are different.
1121
// If OffsetAIsScalable and OffsetBIsScalable are both true, they
1122
// are assumed to have the same scale (vscale).
1123
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1124
WidthA, TRI) &&
1125
getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1126
WidthB, TRI)) {
1127
if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1128
OffsetAIsScalable == OffsetBIsScalable) {
1129
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1130
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1131
TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1132
if (LowWidth.isScalable() == OffsetAIsScalable &&
1133
LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1134
return true;
1135
}
1136
}
1137
return false;
1138
}
1139
1140
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1141
const MachineBasicBlock *MBB,
1142
const MachineFunction &MF) const {
1143
if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1144
return true;
1145
1146
// Do not move an instruction that can be recognized as a branch target.
1147
if (hasBTISemantics(MI))
1148
return true;
1149
1150
switch (MI.getOpcode()) {
1151
case AArch64::HINT:
1152
// CSDB hints are scheduling barriers.
1153
if (MI.getOperand(0).getImm() == 0x14)
1154
return true;
1155
break;
1156
case AArch64::DSB:
1157
case AArch64::ISB:
1158
// DSB and ISB also are scheduling barriers.
1159
return true;
1160
case AArch64::MSRpstatesvcrImm1:
1161
// SMSTART and SMSTOP are also scheduling barriers.
1162
return true;
1163
default:;
1164
}
1165
if (isSEHInstruction(MI))
1166
return true;
1167
auto Next = std::next(MI.getIterator());
1168
return Next != MBB->end() && Next->isCFIInstruction();
1169
}
1170
1171
/// analyzeCompare - For a comparison instruction, return the source registers
1172
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1173
/// Return true if the comparison instruction can be analyzed.
1174
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1175
Register &SrcReg2, int64_t &CmpMask,
1176
int64_t &CmpValue) const {
1177
// The first operand can be a frame index where we'd normally expect a
1178
// register.
1179
assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1180
if (!MI.getOperand(1).isReg())
1181
return false;
1182
1183
switch (MI.getOpcode()) {
1184
default:
1185
break;
1186
case AArch64::PTEST_PP:
1187
case AArch64::PTEST_PP_ANY:
1188
SrcReg = MI.getOperand(0).getReg();
1189
SrcReg2 = MI.getOperand(1).getReg();
1190
// Not sure about the mask and value for now...
1191
CmpMask = ~0;
1192
CmpValue = 0;
1193
return true;
1194
case AArch64::SUBSWrr:
1195
case AArch64::SUBSWrs:
1196
case AArch64::SUBSWrx:
1197
case AArch64::SUBSXrr:
1198
case AArch64::SUBSXrs:
1199
case AArch64::SUBSXrx:
1200
case AArch64::ADDSWrr:
1201
case AArch64::ADDSWrs:
1202
case AArch64::ADDSWrx:
1203
case AArch64::ADDSXrr:
1204
case AArch64::ADDSXrs:
1205
case AArch64::ADDSXrx:
1206
// Replace SUBSWrr with SUBWrr if NZCV is not used.
1207
SrcReg = MI.getOperand(1).getReg();
1208
SrcReg2 = MI.getOperand(2).getReg();
1209
CmpMask = ~0;
1210
CmpValue = 0;
1211
return true;
1212
case AArch64::SUBSWri:
1213
case AArch64::ADDSWri:
1214
case AArch64::SUBSXri:
1215
case AArch64::ADDSXri:
1216
SrcReg = MI.getOperand(1).getReg();
1217
SrcReg2 = 0;
1218
CmpMask = ~0;
1219
CmpValue = MI.getOperand(2).getImm();
1220
return true;
1221
case AArch64::ANDSWri:
1222
case AArch64::ANDSXri:
1223
// ANDS does not use the same encoding scheme as the other xxxS
1224
// instructions.
1225
SrcReg = MI.getOperand(1).getReg();
1226
SrcReg2 = 0;
1227
CmpMask = ~0;
1228
CmpValue = AArch64_AM::decodeLogicalImmediate(
1229
MI.getOperand(2).getImm(),
1230
MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1231
return true;
1232
}
1233
1234
return false;
1235
}
1236
1237
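// Re-constrain every register operand of Instr to the class required by its
// (possibly just replaced) instruction descriptor; returns false if any
// operand cannot be constrained.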
static bool UpdateOperandRegClass(MachineInstr &Instr) {
1238
MachineBasicBlock *MBB = Instr.getParent();
1239
assert(MBB && "Can't get MachineBasicBlock here");
1240
MachineFunction *MF = MBB->getParent();
1241
assert(MF && "Can't get MachineFunction here");
1242
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1243
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1244
MachineRegisterInfo *MRI = &MF->getRegInfo();
1245
1246
for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1247
++OpIdx) {
1248
MachineOperand &MO = Instr.getOperand(OpIdx);
1249
const TargetRegisterClass *OpRegCstraints =
1250
Instr.getRegClassConstraint(OpIdx, TII, TRI);
1251
1252
// If there's no constraint, there's nothing to do.
1253
if (!OpRegCstraints)
1254
continue;
1255
// If the operand is a frame index, there's nothing to do here.
1256
// A frame index operand will resolve correctly during PEI.
1257
if (MO.isFI())
1258
continue;
1259
1260
assert(MO.isReg() &&
1261
"Operand has register constraints without being a register!");
1262
1263
Register Reg = MO.getReg();
1264
if (Reg.isPhysical()) {
1265
if (!OpRegCstraints->contains(Reg))
1266
return false;
1267
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1268
!MRI->constrainRegClass(Reg, OpRegCstraints))
1269
return false;
1270
}
1271
1272
return true;
1273
}
1274
1275
/// Return the opcode that does not set flags when possible - otherwise
1276
/// return the original opcode. The caller is responsible to do the actual
1277
/// substitution and legality checking.
1278
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1279
// Don't convert all compare instructions, because for some the zero register
1280
// encoding becomes the sp register.
1281
bool MIDefinesZeroReg = false;
1282
if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1283
MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1284
MIDefinesZeroReg = true;
1285
1286
switch (MI.getOpcode()) {
1287
default:
1288
return MI.getOpcode();
1289
case AArch64::ADDSWrr:
1290
return AArch64::ADDWrr;
1291
case AArch64::ADDSWri:
1292
return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1293
case AArch64::ADDSWrs:
1294
return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1295
case AArch64::ADDSWrx:
1296
return AArch64::ADDWrx;
1297
case AArch64::ADDSXrr:
1298
return AArch64::ADDXrr;
1299
case AArch64::ADDSXri:
1300
return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1301
case AArch64::ADDSXrs:
1302
return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1303
case AArch64::ADDSXrx:
1304
return AArch64::ADDXrx;
1305
case AArch64::SUBSWrr:
1306
return AArch64::SUBWrr;
1307
case AArch64::SUBSWri:
1308
return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1309
case AArch64::SUBSWrs:
1310
return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1311
case AArch64::SUBSWrx:
1312
return AArch64::SUBWrx;
1313
case AArch64::SUBSXrr:
1314
return AArch64::SUBXrr;
1315
case AArch64::SUBSXri:
1316
return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1317
case AArch64::SUBSXrs:
1318
return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1319
case AArch64::SUBSXrx:
1320
return AArch64::SUBXrx;
1321
}
1322
}
1323
1324
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1325
1326
/// True when condition flags are accessed (either by writing or reading)
1327
/// on the instruction trace starting at From and ending at To.
1328
///
1329
/// Note: If From and To are in different blocks, it's assumed the flags are accessed
1330
/// on the path.
1331
static bool areCFlagsAccessedBetweenInstrs(
1332
MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1333
const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1334
// Early exit if To is at the beginning of the BB.
1335
if (To == To->getParent()->begin())
1336
return true;
1337
1338
// Check whether the instructions are in the same basic block
1339
// If not, assume the condition flags might get modified somewhere.
1340
if (To->getParent() != From->getParent())
1341
return true;
1342
1343
// From must be above To.
1344
assert(std::any_of(
1345
++To.getReverse(), To->getParent()->rend(),
1346
[From](MachineInstr &MI) { return MI.getIterator() == From; }));
1347
1348
// We iterate backward starting at \p To until we hit \p From.
1349
for (const MachineInstr &Instr :
1350
instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1351
if (((AccessToCheck & AK_Write) &&
1352
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1353
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1354
return true;
1355
}
1356
return false;
1357
}
1358
1359
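/// Determine whether PTEST(Mask, Pred) is redundant because Pred already sets
/// NZCV as the PTEST would. Returns the opcode Pred should use instead
/// (possibly its flag-setting variant), or std::nullopt if the PTEST is
/// needed.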
std::optional<unsigned>
1360
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1361
MachineInstr *Pred,
1362
const MachineRegisterInfo *MRI) const {
1363
unsigned MaskOpcode = Mask->getOpcode();
1364
unsigned PredOpcode = Pred->getOpcode();
1365
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1366
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1367
1368
if (PredIsWhileLike) {
1369
// For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1370
// instruction and the condition is "any" since WHILEcc does an implicit
1371
// PTEST(ALL, PG) check and PG is always a subset of ALL.
1372
if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1373
return PredOpcode;
1374
1375
// For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1376
// redundant since WHILE performs an implicit PTEST with an all active
1377
// mask.
1378
if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1379
getElementSizeForOpcode(MaskOpcode) ==
1380
getElementSizeForOpcode(PredOpcode))
1381
return PredOpcode;
1382
1383
return {};
1384
}
1385
1386
if (PredIsPTestLike) {
1387
// For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1388
// instruction that sets the flags as PTEST would and the condition is
1389
// "any" since PG is always a subset of the governing predicate of the
1390
// ptest-like instruction.
1391
if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1392
return PredOpcode;
1393
1394
// For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if
1395
// the element size matches and either the PTEST_LIKE instruction uses
1396
// the same all active mask or the condition is "any".
1397
if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1398
getElementSizeForOpcode(MaskOpcode) ==
1399
getElementSizeForOpcode(PredOpcode)) {
1400
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1401
if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1402
return PredOpcode;
1403
}
1404
1405
// For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1406
// flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1407
// on 8-bit predicates like the PTEST. Otherwise, for instructions like
1408
// compare that also support 16/32/64-bit predicates, the implicit PTEST
1409
// performed by the compare could consider fewer lanes for these element
1410
// sizes.
1411
//
1412
// For example, consider
1413
//
1414
// ptrue p0.b ; P0=1111-1111-1111-1111
1415
// index z0.s, #0, #1 ; Z0=<0,1,2,3>
1416
// index z1.s, #1, #1 ; Z1=<1,2,3,4>
1417
// cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1418
// ; ^ last active
1419
// ptest p0, p1.b ; P1=0001-0001-0001-0001
1420
// ; ^ last active
1421
//
1422
// where the compare generates a canonical all active 32-bit predicate
1423
// (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1424
// active flag, whereas the PTEST instruction with the same mask doesn't.
1425
// For PTEST_ANY this doesn't apply as the flags in this case would be
1426
// identical regardless of element size.
1427
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1428
uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1429
if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1430
PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1431
return PredOpcode;
1432
1433
return {};
1434
}
1435
1436
// If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1437
// opcode so the PTEST becomes redundant.
1438
switch (PredOpcode) {
1439
case AArch64::AND_PPzPP:
1440
case AArch64::BIC_PPzPP:
1441
case AArch64::EOR_PPzPP:
1442
case AArch64::NAND_PPzPP:
1443
case AArch64::NOR_PPzPP:
1444
case AArch64::ORN_PPzPP:
1445
case AArch64::ORR_PPzPP:
1446
case AArch64::BRKA_PPzP:
1447
case AArch64::BRKPA_PPzPP:
1448
case AArch64::BRKB_PPzP:
1449
case AArch64::BRKPB_PPzPP:
1450
case AArch64::RDFFR_PPz: {
1451
// Check to see if our mask is the same. If not, the resulting flag bits
1452
// may be different and we can't remove the ptest.
1453
auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1454
if (Mask != PredMask)
1455
return {};
1456
break;
1457
}
1458
case AArch64::BRKN_PPzP: {
1459
// BRKN uses an all active implicit mask to set flags unlike the other
1460
// flag-setting instructions.
1461
// PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1462
if ((MaskOpcode != AArch64::PTRUE_B) ||
1463
(Mask->getOperand(1).getImm() != 31))
1464
return {};
1465
break;
1466
}
1467
case AArch64::PTRUE_B:
1468
// PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1469
break;
1470
default:
1471
// Bail out if we don't recognize the input
1472
return {};
1473
}
1474
1475
return convertToFlagSettingOpc(PredOpcode);
1476
}
1477
1478
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1479
/// operation which could set the flags in an identical manner
1480
bool AArch64InstrInfo::optimizePTestInstr(
1481
MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1482
const MachineRegisterInfo *MRI) const {
1483
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1484
auto *Pred = MRI->getUniqueVRegDef(PredReg);
1485
unsigned PredOpcode = Pred->getOpcode();
1486
auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1487
if (!NewOp)
1488
return false;
1489
1490
const TargetRegisterInfo *TRI = &getRegisterInfo();
1491
1492
// If another instruction between Pred and PTest accesses flags, don't remove
1493
// the ptest or update the earlier instruction to modify them.
1494
if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1495
return false;
1496
1497
// If we pass all the checks, it's safe to remove the PTEST and use the flags
1498
// as they are prior to PTEST. Sometimes this requires the tested PTEST
1499
// operand to be replaced with an equivalent instruction that also sets the
1500
// flags.
1501
PTest->eraseFromParent();
1502
if (*NewOp != PredOpcode) {
1503
Pred->setDesc(get(*NewOp));
1504
bool succeeded = UpdateOperandRegClass(*Pred);
1505
(void)succeeded;
1506
assert(succeeded && "Operands have incompatible register classes!");
1507
Pred->addRegisterDefined(AArch64::NZCV, TRI);
1508
}
1509
1510
// Ensure that the flags def is live.
1511
if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1512
unsigned i = 0, e = Pred->getNumOperands();
1513
for (; i != e; ++i) {
1514
MachineOperand &MO = Pred->getOperand(i);
1515
if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1516
MO.setIsDead(false);
1517
break;
1518
}
1519
}
1520
}
1521
return true;
1522
}
1523
1524
/// Try to optimize a compare instruction. A compare instruction is an
1525
/// instruction which produces AArch64::NZCV. It is a true compare
1526
/// instruction
1527
/// when there are no uses of its destination register.
1528
///
1529
/// The following steps are tried in order:
1530
/// 1. Convert CmpInstr into an unconditional version.
1531
/// 2. Remove CmpInstr if there is an earlier instruction producing a needed
1532
/// condition code or an instruction which can be converted into such an
1533
/// instruction.
1534
/// Only comparison with zero is supported.
1535
bool AArch64InstrInfo::optimizeCompareInstr(
1536
MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1537
int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1538
assert(CmpInstr.getParent());
1539
assert(MRI);
1540
1541
// Replace SUBSWrr with SUBWrr if NZCV is not used.
1542
int DeadNZCVIdx =
1543
CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1544
if (DeadNZCVIdx != -1) {
1545
if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1546
CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1547
CmpInstr.eraseFromParent();
1548
return true;
1549
}
1550
unsigned Opc = CmpInstr.getOpcode();
1551
unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1552
if (NewOpc == Opc)
1553
return false;
1554
const MCInstrDesc &MCID = get(NewOpc);
1555
CmpInstr.setDesc(MCID);
1556
CmpInstr.removeOperand(DeadNZCVIdx);
1557
bool succeeded = UpdateOperandRegClass(CmpInstr);
1558
(void)succeeded;
1559
assert(succeeded && "Some operands reg class are incompatible!");
1560
return true;
1561
}
1562
1563
if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1564
CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1565
return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1566
1567
if (SrcReg2 != 0)
1568
return false;
1569
1570
// CmpInstr is a Compare instruction if destination register is not used.
1571
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1572
return false;
1573
1574
if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1575
return true;
1576
return (CmpValue == 0 || CmpValue == 1) &&
1577
removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1578
}
1579
1580
/// Get the opcode of the flag-setting (S) version of Instr.
1581
/// If Instr is an S version, its opcode is returned.
1582
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S version
1583
/// or we are not interested in it.
1584
static unsigned sForm(MachineInstr &Instr) {
1585
switch (Instr.getOpcode()) {
1586
default:
1587
return AArch64::INSTRUCTION_LIST_END;
1588
1589
case AArch64::ADDSWrr:
1590
case AArch64::ADDSWri:
1591
case AArch64::ADDSXrr:
1592
case AArch64::ADDSXri:
1593
case AArch64::SUBSWrr:
1594
case AArch64::SUBSWri:
1595
case AArch64::SUBSXrr:
1596
case AArch64::SUBSXri:
1597
return Instr.getOpcode();
1598
1599
case AArch64::ADDWrr:
1600
return AArch64::ADDSWrr;
1601
case AArch64::ADDWri:
1602
return AArch64::ADDSWri;
1603
case AArch64::ADDXrr:
1604
return AArch64::ADDSXrr;
1605
case AArch64::ADDXri:
1606
return AArch64::ADDSXri;
1607
case AArch64::ADCWr:
1608
return AArch64::ADCSWr;
1609
case AArch64::ADCXr:
1610
return AArch64::ADCSXr;
1611
case AArch64::SUBWrr:
1612
return AArch64::SUBSWrr;
1613
case AArch64::SUBWri:
1614
return AArch64::SUBSWri;
1615
case AArch64::SUBXrr:
1616
return AArch64::SUBSXrr;
1617
case AArch64::SUBXri:
1618
return AArch64::SUBSXri;
1619
case AArch64::SBCWr:
1620
return AArch64::SBCSWr;
1621
case AArch64::SBCXr:
1622
return AArch64::SBCSXr;
1623
case AArch64::ANDWri:
1624
return AArch64::ANDSWri;
1625
case AArch64::ANDXri:
1626
return AArch64::ANDSXri;
1627
}
1628
}
1629
1630
/// Check if AArch64::NZCV should be alive in successors of MBB.
1631
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1632
for (auto *BB : MBB->successors())
1633
if (BB->isLiveIn(AArch64::NZCV))
1634
return true;
1635
return false;
1636
}
1637
1638
/// \returns The condition code operand index for \p Instr if it is a branch
1639
/// or select and -1 otherwise.
1640
static int
1641
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1642
switch (Instr.getOpcode()) {
1643
default:
1644
return -1;
1645
1646
case AArch64::Bcc: {
1647
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1648
assert(Idx >= 2);
1649
return Idx - 2;
1650
}
1651
1652
case AArch64::CSINVWr:
1653
case AArch64::CSINVXr:
1654
case AArch64::CSINCWr:
1655
case AArch64::CSINCXr:
1656
case AArch64::CSELWr:
1657
case AArch64::CSELXr:
1658
case AArch64::CSNEGWr:
1659
case AArch64::CSNEGXr:
1660
case AArch64::FCSELSrrr:
1661
case AArch64::FCSELDrrr: {
1662
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1663
assert(Idx >= 1);
1664
return Idx - 1;
1665
}
1666
}
1667
}
1668
1669
/// Find a condition code used by the instruction.
1670
/// Returns AArch64CC::Invalid if either the instruction does not use condition
1671
/// codes or we don't optimize CmpInstr in the presence of such instructions.
1672
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1673
int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1674
return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1675
Instr.getOperand(CCIdx).getImm())
1676
: AArch64CC::Invalid;
1677
}
1678
1679
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1680
assert(CC != AArch64CC::Invalid);
1681
UsedNZCV UsedFlags;
1682
switch (CC) {
1683
default:
1684
break;
1685
1686
case AArch64CC::EQ: // Z set
1687
case AArch64CC::NE: // Z clear
1688
UsedFlags.Z = true;
1689
break;
1690
1691
case AArch64CC::HI: // Z clear and C set
1692
case AArch64CC::LS: // Z set or C clear
1693
UsedFlags.Z = true;
1694
[[fallthrough]];
1695
case AArch64CC::HS: // C set
1696
case AArch64CC::LO: // C clear
1697
UsedFlags.C = true;
1698
break;
1699
1700
case AArch64CC::MI: // N set
1701
case AArch64CC::PL: // N clear
1702
UsedFlags.N = true;
1703
break;
1704
1705
case AArch64CC::VS: // V set
1706
case AArch64CC::VC: // V clear
1707
UsedFlags.V = true;
1708
break;
1709
1710
case AArch64CC::GT: // Z clear, N and V the same
1711
case AArch64CC::LE: // Z set, N and V differ
1712
UsedFlags.Z = true;
1713
[[fallthrough]];
1714
case AArch64CC::GE: // N and V the same
1715
case AArch64CC::LT: // N and V differ
1716
UsedFlags.N = true;
1717
UsedFlags.V = true;
1718
break;
1719
}
1720
return UsedFlags;
1721
}
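// Worked example (not from the original source): "b.hi" uses HI, which reads
// Z and C, so getUsedNZCV(AArch64CC::HI) returns {Z, C}; "b.gt" reads Z, N and
// V; "b.eq" / "b.ne" read only Z.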
1722
1723
/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1724
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
1725
/// \returns std::nullopt otherwise.
1726
///
1727
/// Collect instructions using those flags in \p CCUseInstrs if provided.
1728
std::optional<UsedNZCV>
1729
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1730
const TargetRegisterInfo &TRI,
1731
SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1732
MachineBasicBlock *CmpParent = CmpInstr.getParent();
1733
if (MI.getParent() != CmpParent)
1734
return std::nullopt;
1735
1736
if (areCFlagsAliveInSuccessors(CmpParent))
1737
return std::nullopt;
1738
1739
UsedNZCV NZCVUsedAfterCmp;
1740
for (MachineInstr &Instr : instructionsWithoutDebug(
1741
std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1742
if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1743
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1744
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1745
return std::nullopt;
1746
NZCVUsedAfterCmp |= getUsedNZCV(CC);
1747
if (CCUseInstrs)
1748
CCUseInstrs->push_back(&Instr);
1749
}
1750
if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1751
break;
1752
}
1753
return NZCVUsedAfterCmp;
1754
}
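// Illustrative walk-through (not from the original source; registers are
// arbitrary), assuming MI and the compare sit in the same block and NZCV is
// not live into any successor:
//   subs wzr, w8, #0        ; CmpInstr
//   csel w9, w10, w11, eq   ; reads NZCV; EQ uses only Z
//   adds w12, w13, #1       ; redefines NZCV, the scan stops here
// The function returns {Z} and, if requested, records the CSEL in CCUseInstrs.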
1755
1756
static bool isADDSRegImm(unsigned Opcode) {
1757
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1758
}
1759
1760
static bool isSUBSRegImm(unsigned Opcode) {
1761
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1762
}
1763
1764
/// Check if CmpInstr can be substituted by MI.
1765
///
1766
/// CmpInstr can be substituted:
1767
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1768
/// - and, MI and CmpInstr are from the same MachineBB
1769
/// - and, condition flags are not alive in successors of the CmpInstr parent
1770
/// - and, if MI opcode is the S form there must be no defs of flags between
1771
/// MI and CmpInstr
1772
/// or if MI opcode is not the S form there must be neither defs of flags
1773
/// nor uses of flags between MI and CmpInstr.
1774
/// - and, if C/V flags are not used after CmpInstr
1775
/// or if the V flag is used but MI produces a poison value when signed overflow
1776
/// occurs.
1777
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1778
const TargetRegisterInfo &TRI) {
1779
// NOTE this assertion guarantees that MI.getOpcode() is an add or subtraction
1780
// that may or may not set flags.
1781
assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1782
1783
const unsigned CmpOpcode = CmpInstr.getOpcode();
1784
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1785
return false;
1786
1787
assert((CmpInstr.getOperand(2).isImm() &&
1788
CmpInstr.getOperand(2).getImm() == 0) &&
1789
"Caller guarantees that CmpInstr compares with constant 0");
1790
1791
std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1792
if (!NZVCUsed || NZVCUsed->C)
1793
return false;
1794
1795
// CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1796
// '%vreg = add ...' or '%vreg = sub ...'.
1797
// Condition flag V is used to indicate signed overflow.
1798
// 1) MI and CmpInstr set N and V to the same value.
1799
// 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1800
// signed overflow occurs, so CmpInstr could still be simplified away.
1801
if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1802
return false;
1803
1804
AccessKind AccessToCheck = AK_Write;
1805
if (sForm(MI) != MI.getOpcode())
1806
AccessToCheck = AK_All;
1807
return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1808
}
1809
1810
/// Substitute an instruction comparing to zero with another instruction
1811
/// which produces needed condition flags.
1812
///
1813
/// Return true on success.
1814
bool AArch64InstrInfo::substituteCmpToZero(
1815
MachineInstr &CmpInstr, unsigned SrcReg,
1816
const MachineRegisterInfo &MRI) const {
1817
// Get the unique definition of SrcReg.
1818
MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1819
if (!MI)
1820
return false;
1821
1822
const TargetRegisterInfo &TRI = getRegisterInfo();
1823
1824
unsigned NewOpc = sForm(*MI);
1825
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1826
return false;
1827
1828
if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1829
return false;
1830
1831
// Update the instruction to set NZCV.
1832
MI->setDesc(get(NewOpc));
1833
CmpInstr.eraseFromParent();
1834
bool succeeded = UpdateOperandRegClass(*MI);
1835
(void)succeeded;
1836
assert(succeeded && "Some operands reg class are incompatible!");
1837
MI->addRegisterDefined(AArch64::NZCV, &TRI);
1838
return true;
1839
}
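// Example of the rewrite (not from the original source; registers arbitrary):
//   sub  w8, w9, w10
//   subs wzr, w8, #0        ; cmp w8, #0
//   b.eq <target>
// becomes
//   subs w8, w9, w10
//   b.eq <target>
// The compare is erased and the defining instruction switches to its S form,
// which is safe here because only the Z flag is consumed afterwards.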
1840
1841
/// \returns True if \p CmpInstr can be removed.
1842
///
1843
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1844
/// codes used in \p CCUseInstrs must be inverted.
1845
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1846
int CmpValue, const TargetRegisterInfo &TRI,
1847
SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1848
bool &IsInvertCC) {
1849
assert((CmpValue == 0 || CmpValue == 1) &&
1850
"Only comparisons to 0 or 1 considered for removal!");
1851
1852
// MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1853
unsigned MIOpc = MI.getOpcode();
1854
if (MIOpc == AArch64::CSINCWr) {
1855
if (MI.getOperand(1).getReg() != AArch64::WZR ||
1856
MI.getOperand(2).getReg() != AArch64::WZR)
1857
return false;
1858
} else if (MIOpc == AArch64::CSINCXr) {
1859
if (MI.getOperand(1).getReg() != AArch64::XZR ||
1860
MI.getOperand(2).getReg() != AArch64::XZR)
1861
return false;
1862
} else {
1863
return false;
1864
}
1865
AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1866
if (MICC == AArch64CC::Invalid)
1867
return false;
1868
1869
// NZCV needs to be defined
1870
if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1871
return false;
1872
1873
// CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1874
const unsigned CmpOpcode = CmpInstr.getOpcode();
1875
bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1876
if (CmpValue && !IsSubsRegImm)
1877
return false;
1878
if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1879
return false;
1880
1881
// MI conditions allowed: eq, ne, mi, pl
1882
UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1883
if (MIUsedNZCV.C || MIUsedNZCV.V)
1884
return false;
1885
1886
std::optional<UsedNZCV> NZCVUsedAfterCmp =
1887
examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1888
// Condition flags are not used in CmpInstr basic block successors and only
1889
// Z or N flags are allowed to be used after CmpInstr within its basic block
1890
if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1891
return false;
1892
// Z or N flag used after CmpInstr must correspond to the flag used in MI
1893
if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1894
(MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1895
return false;
1896
// If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
1897
if (MIUsedNZCV.N && !CmpValue)
1898
return false;
1899
1900
// There must be no defs of flags between MI and CmpInstr
1901
if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1902
return false;
1903
1904
// Condition code is inverted in the following cases:
1905
// 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1906
// 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1907
IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1908
(!CmpValue && MICC == AArch64CC::NE);
1909
return true;
1910
}
1911
1912
/// Remove comparison in csinc-cmp sequence
1913
///
1914
/// Examples:
1915
/// 1. \code
1916
/// csinc w9, wzr, wzr, ne
1917
/// cmp w9, #0
1918
/// b.eq
1919
/// \endcode
1920
/// to
1921
/// \code
1922
/// csinc w9, wzr, wzr, ne
1923
/// b.ne
1924
/// \endcode
1925
///
1926
/// 2. \code
1927
/// csinc x2, xzr, xzr, mi
1928
/// cmp x2, #1
1929
/// b.pl
1930
/// \endcode
1931
/// to
1932
/// \code
1933
/// csinc x2, xzr, xzr, mi
1934
/// b.pl
1935
/// \endcode
1936
///
1937
/// \param CmpInstr comparison instruction
1938
/// \return True when comparison removed
1939
bool AArch64InstrInfo::removeCmpToZeroOrOne(
1940
MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1941
const MachineRegisterInfo &MRI) const {
1942
MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1943
if (!MI)
1944
return false;
1945
const TargetRegisterInfo &TRI = getRegisterInfo();
1946
SmallVector<MachineInstr *, 4> CCUseInstrs;
1947
bool IsInvertCC = false;
1948
if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1949
IsInvertCC))
1950
return false;
1951
// Make transformation
1952
CmpInstr.eraseFromParent();
1953
if (IsInvertCC) {
1954
// Invert condition codes in CmpInstr CC users
1955
for (MachineInstr *CCUseInstr : CCUseInstrs) {
1956
int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1957
assert(Idx >= 0 && "Unexpected instruction using CC.");
1958
MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1959
AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1960
static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1961
CCOperand.setImm(CCUse);
1962
}
1963
}
1964
return true;
1965
}
1966
1967
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1968
if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1969
MI.getOpcode() != AArch64::CATCHRET)
1970
return false;
1971
1972
MachineBasicBlock &MBB = *MI.getParent();
1973
auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1974
auto TRI = Subtarget.getRegisterInfo();
1975
DebugLoc DL = MI.getDebugLoc();
1976
1977
if (MI.getOpcode() == AArch64::CATCHRET) {
1978
// Skip to the first instruction before the epilog.
1979
const TargetInstrInfo *TII =
1980
MBB.getParent()->getSubtarget().getInstrInfo();
1981
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1982
auto MBBI = MachineBasicBlock::iterator(MI);
1983
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1984
while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1985
FirstEpilogSEH != MBB.begin())
1986
FirstEpilogSEH = std::prev(FirstEpilogSEH);
1987
if (FirstEpilogSEH != MBB.begin())
1988
FirstEpilogSEH = std::next(FirstEpilogSEH);
1989
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1990
.addReg(AArch64::X0, RegState::Define)
1991
.addMBB(TargetMBB);
1992
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1993
.addReg(AArch64::X0, RegState::Define)
1994
.addReg(AArch64::X0)
1995
.addMBB(TargetMBB)
1996
.addImm(0);
1997
return true;
1998
}
1999
2000
Register Reg = MI.getOperand(0).getReg();
2001
Module &M = *MBB.getParent()->getFunction().getParent();
2002
if (M.getStackProtectorGuard() == "sysreg") {
2003
const AArch64SysReg::SysReg *SrcReg =
2004
AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2005
if (!SrcReg)
2006
report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2007
2008
// mrs xN, sysreg
2009
BuildMI(MBB, MI, DL, get(AArch64::MRS))
2010
.addDef(Reg, RegState::Renamable)
2011
.addImm(SrcReg->Encoding);
2012
int Offset = M.getStackProtectorGuardOffset();
2013
if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2014
// ldr xN, [xN, #offset]
2015
BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2016
.addDef(Reg)
2017
.addUse(Reg, RegState::Kill)
2018
.addImm(Offset / 8);
2019
} else if (Offset >= -256 && Offset <= 255) {
2020
// ldur xN, [xN, #offset]
2021
BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2022
.addDef(Reg)
2023
.addUse(Reg, RegState::Kill)
2024
.addImm(Offset);
2025
} else if (Offset >= -4095 && Offset <= 4095) {
2026
if (Offset > 0) {
2027
// add xN, xN, #offset
2028
BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2029
.addDef(Reg)
2030
.addUse(Reg, RegState::Kill)
2031
.addImm(Offset)
2032
.addImm(0);
2033
} else {
2034
// sub xN, xN, #offset
2035
BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2036
.addDef(Reg)
2037
.addUse(Reg, RegState::Kill)
2038
.addImm(-Offset)
2039
.addImm(0);
2040
}
2041
// ldr xN, [xN]
2042
BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2043
.addDef(Reg)
2044
.addUse(Reg, RegState::Kill)
2045
.addImm(0);
2046
} else {
2047
// Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2048
// than 32760.
2049
// It might be nice to use AArch64::MOVi32imm here, which would get
2050
// expanded in PreSched2 after PostRA, but our lone scratch Reg already
2051
// contains the MRS result. findScratchNonCalleeSaveRegister() in
2052
// AArch64FrameLowering might help us find such a scratch register
2053
// though. If we failed to find a scratch register, we could emit a
2054
// stream of add instructions to build up the immediate. Or, we could try
2055
// to insert a AArch64::MOVi32imm before register allocation so that we
2056
// didn't need to scavenge for a scratch register.
2057
report_fatal_error("Unable to encode Stack Protector Guard Offset");
2058
}
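// For illustration (not from the original source; the system register, result
// register and offsets are made up): with guard register TPIDR_EL0 and offset
// 40 the scaled form applies:
//   mrs  x0, TPIDR_EL0
//   ldr  x0, [x0, #40]
// offset -32 falls back to the unscaled load:
//   mrs  x0, TPIDR_EL0
//   ldur x0, [x0, #-32]
// and offset 2001 (unaligned, outside the ldur range) uses add + ldr:
//   mrs  x0, TPIDR_EL0
//   add  x0, x0, #2001
//   ldr  x0, [x0]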
2059
MBB.erase(MI);
2060
return true;
2061
}
2062
2063
const GlobalValue *GV =
2064
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2065
const TargetMachine &TM = MBB.getParent()->getTarget();
2066
unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2067
const unsigned char MO_NC = AArch64II::MO_NC;
2068
2069
if ((OpFlags & AArch64II::MO_GOT) != 0) {
2070
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2071
.addGlobalAddress(GV, 0, OpFlags);
2072
if (Subtarget.isTargetILP32()) {
2073
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2074
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2075
.addDef(Reg32, RegState::Dead)
2076
.addUse(Reg, RegState::Kill)
2077
.addImm(0)
2078
.addMemOperand(*MI.memoperands_begin())
2079
.addDef(Reg, RegState::Implicit);
2080
} else {
2081
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2082
.addReg(Reg, RegState::Kill)
2083
.addImm(0)
2084
.addMemOperand(*MI.memoperands_begin());
2085
}
2086
} else if (TM.getCodeModel() == CodeModel::Large) {
2087
assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2088
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2089
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2090
.addImm(0);
2091
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2092
.addReg(Reg, RegState::Kill)
2093
.addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2094
.addImm(16);
2095
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2096
.addReg(Reg, RegState::Kill)
2097
.addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2098
.addImm(32);
2099
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2100
.addReg(Reg, RegState::Kill)
2101
.addGlobalAddress(GV, 0, AArch64II::MO_G3)
2102
.addImm(48);
2103
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2104
.addReg(Reg, RegState::Kill)
2105
.addImm(0)
2106
.addMemOperand(*MI.memoperands_begin());
2107
} else if (TM.getCodeModel() == CodeModel::Tiny) {
2108
BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2109
.addGlobalAddress(GV, 0, OpFlags);
2110
} else {
2111
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2112
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2113
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2114
if (Subtarget.isTargetILP32()) {
2115
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2116
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2117
.addDef(Reg32, RegState::Dead)
2118
.addUse(Reg, RegState::Kill)
2119
.addGlobalAddress(GV, 0, LoFlags)
2120
.addMemOperand(*MI.memoperands_begin())
2121
.addDef(Reg, RegState::Implicit);
2122
} else {
2123
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2124
.addReg(Reg, RegState::Kill)
2125
.addGlobalAddress(GV, 0, LoFlags)
2126
.addMemOperand(*MI.memoperands_begin());
2127
}
2128
}
2129
2130
MBB.erase(MI);
2131
2132
return true;
2133
}
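// Sketch of the LOAD_STACK_GUARD expansion for a global guard value (not from
// the original source; the register and symbol are assumed). With a
// GOT-indirect reference this becomes roughly:
//   adrp x8, :got:__stack_chk_guard
//   ldr  x8, [x8, :got_lo12:__stack_chk_guard]
// a direct small-code-model reference uses
//   adrp x8, __stack_chk_guard
//   ldr  x8, [x8, :lo12:__stack_chk_guard]
// and the large code model materializes the address with movz/movk before the
// final load.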
2134
2135
// Return true if this instruction simply sets its single destination register
2136
// to zero. This is equivalent to a register rename of the zero-register.
2137
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2138
switch (MI.getOpcode()) {
2139
default:
2140
break;
2141
case AArch64::MOVZWi:
2142
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2143
if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2144
assert(MI.getDesc().getNumOperands() == 3 &&
2145
MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2146
return true;
2147
}
2148
break;
2149
case AArch64::ANDWri: // and Rd, Rzr, #imm
2150
return MI.getOperand(1).getReg() == AArch64::WZR;
2151
case AArch64::ANDXri:
2152
return MI.getOperand(1).getReg() == AArch64::XZR;
2153
case TargetOpcode::COPY:
2154
return MI.getOperand(1).getReg() == AArch64::WZR;
2155
}
2156
return false;
2157
}
2158
2159
// Return true if this instruction simply renames a general register without
2160
// modifying bits.
2161
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2162
switch (MI.getOpcode()) {
2163
default:
2164
break;
2165
case TargetOpcode::COPY: {
2166
// GPR32 copies will be lowered to ORRXrs
2167
Register DstReg = MI.getOperand(0).getReg();
2168
return (AArch64::GPR32RegClass.contains(DstReg) ||
2169
AArch64::GPR64RegClass.contains(DstReg));
2170
}
2171
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2172
if (MI.getOperand(1).getReg() == AArch64::XZR) {
2173
assert(MI.getDesc().getNumOperands() == 4 &&
2174
MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2175
return true;
2176
}
2177
break;
2178
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2179
if (MI.getOperand(2).getImm() == 0) {
2180
assert(MI.getDesc().getNumOperands() == 4 &&
2181
MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2182
return true;
2183
}
2184
break;
2185
}
2186
return false;
2187
}
2188
2189
// Return true if this instruction simply renames a floating-point register without
2190
// modifying bits.
2191
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2192
switch (MI.getOpcode()) {
2193
default:
2194
break;
2195
case TargetOpcode::COPY: {
2196
Register DstReg = MI.getOperand(0).getReg();
2197
return AArch64::FPR128RegClass.contains(DstReg);
2198
}
2199
case AArch64::ORRv16i8:
2200
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2201
assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2202
"invalid ORRv16i8 operands");
2203
return true;
2204
}
2205
break;
2206
}
2207
return false;
2208
}
2209
2210
Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2211
int &FrameIndex) const {
2212
switch (MI.getOpcode()) {
2213
default:
2214
break;
2215
case AArch64::LDRWui:
2216
case AArch64::LDRXui:
2217
case AArch64::LDRBui:
2218
case AArch64::LDRHui:
2219
case AArch64::LDRSui:
2220
case AArch64::LDRDui:
2221
case AArch64::LDRQui:
2222
case AArch64::LDR_PXI:
2223
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2224
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2225
FrameIndex = MI.getOperand(1).getIndex();
2226
return MI.getOperand(0).getReg();
2227
}
2228
break;
2229
}
2230
2231
return 0;
2232
}
2233
2234
Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2235
int &FrameIndex) const {
2236
switch (MI.getOpcode()) {
2237
default:
2238
break;
2239
case AArch64::STRWui:
2240
case AArch64::STRXui:
2241
case AArch64::STRBui:
2242
case AArch64::STRHui:
2243
case AArch64::STRSui:
2244
case AArch64::STRDui:
2245
case AArch64::STRQui:
2246
case AArch64::STR_PXI:
2247
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2248
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2249
FrameIndex = MI.getOperand(1).getIndex();
2250
return MI.getOperand(0).getReg();
2251
}
2252
break;
2253
}
2254
return 0;
2255
}
2256
2257
/// Check all MachineMemOperands for a hint to suppress pairing.
2258
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2259
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2260
return MMO->getFlags() & MOSuppressPair;
2261
});
2262
}
2263
2264
/// Set a flag on the first MachineMemOperand to suppress pairing.
2265
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2266
if (MI.memoperands_empty())
2267
return;
2268
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
2269
}
2270
2271
/// Check all MachineMemOperands for a hint that the load/store is strided.
2272
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2273
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2274
return MMO->getFlags() & MOStridedAccess;
2275
});
2276
}
2277
2278
bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2279
switch (Opc) {
2280
default:
2281
return false;
2282
case AArch64::STURSi:
2283
case AArch64::STRSpre:
2284
case AArch64::STURDi:
2285
case AArch64::STRDpre:
2286
case AArch64::STURQi:
2287
case AArch64::STRQpre:
2288
case AArch64::STURBBi:
2289
case AArch64::STURHHi:
2290
case AArch64::STURWi:
2291
case AArch64::STRWpre:
2292
case AArch64::STURXi:
2293
case AArch64::STRXpre:
2294
case AArch64::LDURSi:
2295
case AArch64::LDRSpre:
2296
case AArch64::LDURDi:
2297
case AArch64::LDRDpre:
2298
case AArch64::LDURQi:
2299
case AArch64::LDRQpre:
2300
case AArch64::LDURWi:
2301
case AArch64::LDRWpre:
2302
case AArch64::LDURXi:
2303
case AArch64::LDRXpre:
2304
case AArch64::LDRSWpre:
2305
case AArch64::LDURSWi:
2306
case AArch64::LDURHHi:
2307
case AArch64::LDURBBi:
2308
case AArch64::LDURSBWi:
2309
case AArch64::LDURSHWi:
2310
return true;
2311
}
2312
}
2313
2314
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2315
switch (Opc) {
2316
default: return {};
2317
case AArch64::PRFMui: return AArch64::PRFUMi;
2318
case AArch64::LDRXui: return AArch64::LDURXi;
2319
case AArch64::LDRWui: return AArch64::LDURWi;
2320
case AArch64::LDRBui: return AArch64::LDURBi;
2321
case AArch64::LDRHui: return AArch64::LDURHi;
2322
case AArch64::LDRSui: return AArch64::LDURSi;
2323
case AArch64::LDRDui: return AArch64::LDURDi;
2324
case AArch64::LDRQui: return AArch64::LDURQi;
2325
case AArch64::LDRBBui: return AArch64::LDURBBi;
2326
case AArch64::LDRHHui: return AArch64::LDURHHi;
2327
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2328
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2329
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2330
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2331
case AArch64::LDRSWui: return AArch64::LDURSWi;
2332
case AArch64::STRXui: return AArch64::STURXi;
2333
case AArch64::STRWui: return AArch64::STURWi;
2334
case AArch64::STRBui: return AArch64::STURBi;
2335
case AArch64::STRHui: return AArch64::STURHi;
2336
case AArch64::STRSui: return AArch64::STURSi;
2337
case AArch64::STRDui: return AArch64::STURDi;
2338
case AArch64::STRQui: return AArch64::STURQi;
2339
case AArch64::STRBBui: return AArch64::STURBBi;
2340
case AArch64::STRHHui: return AArch64::STURHHi;
2341
}
2342
}
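// Note on scaling (added for clarity, not in the original source): the scaled
// and unscaled forms address the same memory but encode the immediate
// differently, e.g.
//   ldr  x0, [x1, #16]   ; LDRXui, encoded immediate = 16 / 8 = 2
//   ldur x0, [x1, #16]   ; LDURXi, encoded immediate = 16 (signed byte offset,
//                        ;         range -256 .. 255)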
2343
2344
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2345
switch (Opc) {
2346
default:
2347
return 2;
2348
case AArch64::LDPXi:
2349
case AArch64::LDPDi:
2350
case AArch64::STPXi:
2351
case AArch64::STPDi:
2352
case AArch64::LDNPXi:
2353
case AArch64::LDNPDi:
2354
case AArch64::STNPXi:
2355
case AArch64::STNPDi:
2356
case AArch64::LDPQi:
2357
case AArch64::STPQi:
2358
case AArch64::LDNPQi:
2359
case AArch64::STNPQi:
2360
case AArch64::LDPWi:
2361
case AArch64::LDPSi:
2362
case AArch64::STPWi:
2363
case AArch64::STPSi:
2364
case AArch64::LDNPWi:
2365
case AArch64::LDNPSi:
2366
case AArch64::STNPWi:
2367
case AArch64::STNPSi:
2368
case AArch64::LDG:
2369
case AArch64::STGPi:
2370
2371
case AArch64::LD1B_IMM:
2372
case AArch64::LD1B_H_IMM:
2373
case AArch64::LD1B_S_IMM:
2374
case AArch64::LD1B_D_IMM:
2375
case AArch64::LD1SB_H_IMM:
2376
case AArch64::LD1SB_S_IMM:
2377
case AArch64::LD1SB_D_IMM:
2378
case AArch64::LD1H_IMM:
2379
case AArch64::LD1H_S_IMM:
2380
case AArch64::LD1H_D_IMM:
2381
case AArch64::LD1SH_S_IMM:
2382
case AArch64::LD1SH_D_IMM:
2383
case AArch64::LD1W_IMM:
2384
case AArch64::LD1W_D_IMM:
2385
case AArch64::LD1SW_D_IMM:
2386
case AArch64::LD1D_IMM:
2387
2388
case AArch64::LD2B_IMM:
2389
case AArch64::LD2H_IMM:
2390
case AArch64::LD2W_IMM:
2391
case AArch64::LD2D_IMM:
2392
case AArch64::LD3B_IMM:
2393
case AArch64::LD3H_IMM:
2394
case AArch64::LD3W_IMM:
2395
case AArch64::LD3D_IMM:
2396
case AArch64::LD4B_IMM:
2397
case AArch64::LD4H_IMM:
2398
case AArch64::LD4W_IMM:
2399
case AArch64::LD4D_IMM:
2400
2401
case AArch64::ST1B_IMM:
2402
case AArch64::ST1B_H_IMM:
2403
case AArch64::ST1B_S_IMM:
2404
case AArch64::ST1B_D_IMM:
2405
case AArch64::ST1H_IMM:
2406
case AArch64::ST1H_S_IMM:
2407
case AArch64::ST1H_D_IMM:
2408
case AArch64::ST1W_IMM:
2409
case AArch64::ST1W_D_IMM:
2410
case AArch64::ST1D_IMM:
2411
2412
case AArch64::ST2B_IMM:
2413
case AArch64::ST2H_IMM:
2414
case AArch64::ST2W_IMM:
2415
case AArch64::ST2D_IMM:
2416
case AArch64::ST3B_IMM:
2417
case AArch64::ST3H_IMM:
2418
case AArch64::ST3W_IMM:
2419
case AArch64::ST3D_IMM:
2420
case AArch64::ST4B_IMM:
2421
case AArch64::ST4H_IMM:
2422
case AArch64::ST4W_IMM:
2423
case AArch64::ST4D_IMM:
2424
2425
case AArch64::LD1RB_IMM:
2426
case AArch64::LD1RB_H_IMM:
2427
case AArch64::LD1RB_S_IMM:
2428
case AArch64::LD1RB_D_IMM:
2429
case AArch64::LD1RSB_H_IMM:
2430
case AArch64::LD1RSB_S_IMM:
2431
case AArch64::LD1RSB_D_IMM:
2432
case AArch64::LD1RH_IMM:
2433
case AArch64::LD1RH_S_IMM:
2434
case AArch64::LD1RH_D_IMM:
2435
case AArch64::LD1RSH_S_IMM:
2436
case AArch64::LD1RSH_D_IMM:
2437
case AArch64::LD1RW_IMM:
2438
case AArch64::LD1RW_D_IMM:
2439
case AArch64::LD1RSW_IMM:
2440
case AArch64::LD1RD_IMM:
2441
2442
case AArch64::LDNT1B_ZRI:
2443
case AArch64::LDNT1H_ZRI:
2444
case AArch64::LDNT1W_ZRI:
2445
case AArch64::LDNT1D_ZRI:
2446
case AArch64::STNT1B_ZRI:
2447
case AArch64::STNT1H_ZRI:
2448
case AArch64::STNT1W_ZRI:
2449
case AArch64::STNT1D_ZRI:
2450
2451
case AArch64::LDNF1B_IMM:
2452
case AArch64::LDNF1B_H_IMM:
2453
case AArch64::LDNF1B_S_IMM:
2454
case AArch64::LDNF1B_D_IMM:
2455
case AArch64::LDNF1SB_H_IMM:
2456
case AArch64::LDNF1SB_S_IMM:
2457
case AArch64::LDNF1SB_D_IMM:
2458
case AArch64::LDNF1H_IMM:
2459
case AArch64::LDNF1H_S_IMM:
2460
case AArch64::LDNF1H_D_IMM:
2461
case AArch64::LDNF1SH_S_IMM:
2462
case AArch64::LDNF1SH_D_IMM:
2463
case AArch64::LDNF1W_IMM:
2464
case AArch64::LDNF1W_D_IMM:
2465
case AArch64::LDNF1SW_D_IMM:
2466
case AArch64::LDNF1D_IMM:
2467
return 3;
2468
case AArch64::ADDG:
2469
case AArch64::STGi:
2470
case AArch64::LDR_PXI:
2471
case AArch64::STR_PXI:
2472
return 2;
2473
}
2474
}
2475
2476
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2477
switch (MI.getOpcode()) {
2478
default:
2479
return false;
2480
// Scaled instructions.
2481
case AArch64::STRSui:
2482
case AArch64::STRDui:
2483
case AArch64::STRQui:
2484
case AArch64::STRXui:
2485
case AArch64::STRWui:
2486
case AArch64::LDRSui:
2487
case AArch64::LDRDui:
2488
case AArch64::LDRQui:
2489
case AArch64::LDRXui:
2490
case AArch64::LDRWui:
2491
case AArch64::LDRSWui:
2492
// Unscaled instructions.
2493
case AArch64::STURSi:
2494
case AArch64::STRSpre:
2495
case AArch64::STURDi:
2496
case AArch64::STRDpre:
2497
case AArch64::STURQi:
2498
case AArch64::STRQpre:
2499
case AArch64::STURWi:
2500
case AArch64::STRWpre:
2501
case AArch64::STURXi:
2502
case AArch64::STRXpre:
2503
case AArch64::LDURSi:
2504
case AArch64::LDRSpre:
2505
case AArch64::LDURDi:
2506
case AArch64::LDRDpre:
2507
case AArch64::LDURQi:
2508
case AArch64::LDRQpre:
2509
case AArch64::LDURWi:
2510
case AArch64::LDRWpre:
2511
case AArch64::LDURXi:
2512
case AArch64::LDRXpre:
2513
case AArch64::LDURSWi:
2514
case AArch64::LDRSWpre:
2515
return true;
2516
}
2517
}
2518
2519
bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2520
switch (MI.getOpcode()) {
2521
default:
2522
assert((!MI.isCall() || !MI.isReturn()) &&
2523
"Unexpected instruction - was a new tail call opcode introduced?");
2524
return false;
2525
case AArch64::TCRETURNdi:
2526
case AArch64::TCRETURNri:
2527
case AArch64::TCRETURNrix16x17:
2528
case AArch64::TCRETURNrix17:
2529
case AArch64::TCRETURNrinotx16:
2530
case AArch64::TCRETURNriALL:
2531
case AArch64::AUTH_TCRETURN:
2532
case AArch64::AUTH_TCRETURN_BTI:
2533
return true;
2534
}
2535
}
2536
2537
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2538
switch (Opc) {
2539
default:
2540
llvm_unreachable("Opcode has no flag setting equivalent!");
2541
// 32-bit cases:
2542
case AArch64::ADDWri:
2543
return AArch64::ADDSWri;
2544
case AArch64::ADDWrr:
2545
return AArch64::ADDSWrr;
2546
case AArch64::ADDWrs:
2547
return AArch64::ADDSWrs;
2548
case AArch64::ADDWrx:
2549
return AArch64::ADDSWrx;
2550
case AArch64::ANDWri:
2551
return AArch64::ANDSWri;
2552
case AArch64::ANDWrr:
2553
return AArch64::ANDSWrr;
2554
case AArch64::ANDWrs:
2555
return AArch64::ANDSWrs;
2556
case AArch64::BICWrr:
2557
return AArch64::BICSWrr;
2558
case AArch64::BICWrs:
2559
return AArch64::BICSWrs;
2560
case AArch64::SUBWri:
2561
return AArch64::SUBSWri;
2562
case AArch64::SUBWrr:
2563
return AArch64::SUBSWrr;
2564
case AArch64::SUBWrs:
2565
return AArch64::SUBSWrs;
2566
case AArch64::SUBWrx:
2567
return AArch64::SUBSWrx;
2568
// 64-bit cases:
2569
case AArch64::ADDXri:
2570
return AArch64::ADDSXri;
2571
case AArch64::ADDXrr:
2572
return AArch64::ADDSXrr;
2573
case AArch64::ADDXrs:
2574
return AArch64::ADDSXrs;
2575
case AArch64::ADDXrx:
2576
return AArch64::ADDSXrx;
2577
case AArch64::ANDXri:
2578
return AArch64::ANDSXri;
2579
case AArch64::ANDXrr:
2580
return AArch64::ANDSXrr;
2581
case AArch64::ANDXrs:
2582
return AArch64::ANDSXrs;
2583
case AArch64::BICXrr:
2584
return AArch64::BICSXrr;
2585
case AArch64::BICXrs:
2586
return AArch64::BICSXrs;
2587
case AArch64::SUBXri:
2588
return AArch64::SUBSXri;
2589
case AArch64::SUBXrr:
2590
return AArch64::SUBSXrr;
2591
case AArch64::SUBXrs:
2592
return AArch64::SUBSXrs;
2593
case AArch64::SUBXrx:
2594
return AArch64::SUBSXrx;
2595
// SVE instructions:
2596
case AArch64::AND_PPzPP:
2597
return AArch64::ANDS_PPzPP;
2598
case AArch64::BIC_PPzPP:
2599
return AArch64::BICS_PPzPP;
2600
case AArch64::EOR_PPzPP:
2601
return AArch64::EORS_PPzPP;
2602
case AArch64::NAND_PPzPP:
2603
return AArch64::NANDS_PPzPP;
2604
case AArch64::NOR_PPzPP:
2605
return AArch64::NORS_PPzPP;
2606
case AArch64::ORN_PPzPP:
2607
return AArch64::ORNS_PPzPP;
2608
case AArch64::ORR_PPzPP:
2609
return AArch64::ORRS_PPzPP;
2610
case AArch64::BRKA_PPzP:
2611
return AArch64::BRKAS_PPzP;
2612
case AArch64::BRKPA_PPzPP:
2613
return AArch64::BRKPAS_PPzPP;
2614
case AArch64::BRKB_PPzP:
2615
return AArch64::BRKBS_PPzP;
2616
case AArch64::BRKPB_PPzPP:
2617
return AArch64::BRKPBS_PPzPP;
2618
case AArch64::BRKN_PPzP:
2619
return AArch64::BRKNS_PPzP;
2620
case AArch64::RDFFR_PPz:
2621
return AArch64::RDFFRS_PPz;
2622
case AArch64::PTRUE_B:
2623
return AArch64::PTRUES_B;
2624
}
2625
}
2626
2627
// Is this a candidate for ld/st merging or pairing? For example, we don't
2628
// touch volatiles or load/stores that have a hint to avoid pair formation.
2629
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2630
2631
bool IsPreLdSt = isPreLdSt(MI);
2632
2633
// If this is a volatile load/store, don't mess with it.
2634
if (MI.hasOrderedMemoryRef())
2635
return false;
2636
2637
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
2638
// For Pre-inc LD/ST, the operand is shifted by one.
2639
assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2640
MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2641
"Expected a reg or frame index operand.");
2642
2643
// For Pre-indexed addressing quadword instructions, the third operand is the
2644
// immediate value.
2645
bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2646
2647
if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2648
return false;
2649
2650
// Can't merge/pair if the instruction modifies the base register.
2651
// e.g., ldr x0, [x0]
2652
// This case will never occur with an FI base.
2653
// However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2654
// STR<S,D,Q,W,X>pre, it can be merged.
2655
// For example:
2656
// ldr q0, [x11, #32]!
2657
// ldr q1, [x11, #16]
2658
// to
2659
// ldp q0, q1, [x11, #32]!
2660
if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2661
Register BaseReg = MI.getOperand(1).getReg();
2662
const TargetRegisterInfo *TRI = &getRegisterInfo();
2663
if (MI.modifiesRegister(BaseReg, TRI))
2664
return false;
2665
}
2666
2667
// Check if this load/store has a hint to avoid pair formation.
2668
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2669
if (isLdStPairSuppressed(MI))
2670
return false;
2671
2672
// Do not pair any callee-save store/reload instructions in the
2673
// prologue/epilogue if the CFI information encoded the operations as separate
2674
// instructions, as that will cause the size of the actual prologue to mismatch
2675
// with the prologue size recorded in the Windows CFI.
2676
const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2677
bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2678
MI.getMF()->getFunction().needsUnwindTableEntry();
2679
if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2680
MI.getFlag(MachineInstr::FrameDestroy)))
2681
return false;
2682
2683
// On some CPUs quad load/store pairs are slower than two single load/stores.
2684
if (Subtarget.isPaired128Slow()) {
2685
switch (MI.getOpcode()) {
2686
default:
2687
break;
2688
case AArch64::LDURQi:
2689
case AArch64::STURQi:
2690
case AArch64::LDRQui:
2691
case AArch64::STRQui:
2692
return false;
2693
}
2694
}
2695
2696
return true;
2697
}
2698
2699
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2700
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2701
int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2702
const TargetRegisterInfo *TRI) const {
2703
if (!LdSt.mayLoadOrStore())
2704
return false;
2705
2706
const MachineOperand *BaseOp;
2707
TypeSize WidthN(0, false);
2708
if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2709
WidthN, TRI))
2710
return false;
2711
// The maximum vscale is 16 under AArch64, return the maximal extent for the
2712
// vector.
2713
Width = LocationSize::precise(WidthN);
2714
BaseOps.push_back(BaseOp);
2715
return true;
2716
}
2717
2718
std::optional<ExtAddrMode>
2719
AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2720
const TargetRegisterInfo *TRI) const {
2721
const MachineOperand *Base; // Filled with the base operand of MI.
2722
int64_t Offset; // Filled with the offset of MI.
2723
bool OffsetIsScalable;
2724
if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2725
return std::nullopt;
2726
2727
if (!Base->isReg())
2728
return std::nullopt;
2729
ExtAddrMode AM;
2730
AM.BaseReg = Base->getReg();
2731
AM.Displacement = Offset;
2732
AM.ScaledReg = 0;
2733
AM.Scale = 0;
2734
return AM;
2735
}
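// For illustration (not from the original source): for "ldr x0, [x1, #16]"
// this returns an ExtAddrMode with BaseReg = x1, Displacement = 16,
// ScaledReg = 0 and Scale = 0.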
2736
2737
bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2738
Register Reg,
2739
const MachineInstr &AddrI,
2740
ExtAddrMode &AM) const {
2741
// Filter out instructions into which we cannot fold.
2742
unsigned NumBytes;
2743
int64_t OffsetScale = 1;
2744
switch (MemI.getOpcode()) {
2745
default:
2746
return false;
2747
2748
case AArch64::LDURQi:
2749
case AArch64::STURQi:
2750
NumBytes = 16;
2751
break;
2752
2753
case AArch64::LDURDi:
2754
case AArch64::STURDi:
2755
case AArch64::LDURXi:
2756
case AArch64::STURXi:
2757
NumBytes = 8;
2758
break;
2759
2760
case AArch64::LDURWi:
2761
case AArch64::LDURSWi:
2762
case AArch64::STURWi:
2763
NumBytes = 4;
2764
break;
2765
2766
case AArch64::LDURHi:
2767
case AArch64::STURHi:
2768
case AArch64::LDURHHi:
2769
case AArch64::STURHHi:
2770
case AArch64::LDURSHXi:
2771
case AArch64::LDURSHWi:
2772
NumBytes = 2;
2773
break;
2774
2775
case AArch64::LDRBroX:
2776
case AArch64::LDRBBroX:
2777
case AArch64::LDRSBXroX:
2778
case AArch64::LDRSBWroX:
2779
case AArch64::STRBroX:
2780
case AArch64::STRBBroX:
2781
case AArch64::LDURBi:
2782
case AArch64::LDURBBi:
2783
case AArch64::LDURSBXi:
2784
case AArch64::LDURSBWi:
2785
case AArch64::STURBi:
2786
case AArch64::STURBBi:
2787
case AArch64::LDRBui:
2788
case AArch64::LDRBBui:
2789
case AArch64::LDRSBXui:
2790
case AArch64::LDRSBWui:
2791
case AArch64::STRBui:
2792
case AArch64::STRBBui:
2793
NumBytes = 1;
2794
break;
2795
2796
case AArch64::LDRQroX:
2797
case AArch64::STRQroX:
2798
case AArch64::LDRQui:
2799
case AArch64::STRQui:
2800
NumBytes = 16;
2801
OffsetScale = 16;
2802
break;
2803
2804
case AArch64::LDRDroX:
2805
case AArch64::STRDroX:
2806
case AArch64::LDRXroX:
2807
case AArch64::STRXroX:
2808
case AArch64::LDRDui:
2809
case AArch64::STRDui:
2810
case AArch64::LDRXui:
2811
case AArch64::STRXui:
2812
NumBytes = 8;
2813
OffsetScale = 8;
2814
break;
2815
2816
case AArch64::LDRWroX:
2817
case AArch64::LDRSWroX:
2818
case AArch64::STRWroX:
2819
case AArch64::LDRWui:
2820
case AArch64::LDRSWui:
2821
case AArch64::STRWui:
2822
NumBytes = 4;
2823
OffsetScale = 4;
2824
break;
2825
2826
case AArch64::LDRHroX:
2827
case AArch64::STRHroX:
2828
case AArch64::LDRHHroX:
2829
case AArch64::STRHHroX:
2830
case AArch64::LDRSHXroX:
2831
case AArch64::LDRSHWroX:
2832
case AArch64::LDRHui:
2833
case AArch64::STRHui:
2834
case AArch64::LDRHHui:
2835
case AArch64::STRHHui:
2836
case AArch64::LDRSHXui:
2837
case AArch64::LDRSHWui:
2838
NumBytes = 2;
2839
OffsetScale = 2;
2840
break;
2841
}
2842
2843
// Check the fold operand is not the loaded/stored value.
2844
const MachineOperand &BaseRegOp = MemI.getOperand(0);
2845
if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2846
return false;
2847
2848
// Handle memory instructions with a [Reg, Reg] addressing mode.
2849
if (MemI.getOperand(2).isReg()) {
2850
// Bail if the addressing mode already includes extension of the offset
2851
// register.
2852
if (MemI.getOperand(3).getImm())
2853
return false;
2854
2855
// Check if we actually have a scaled offset.
2856
if (MemI.getOperand(4).getImm() == 0)
2857
OffsetScale = 1;
2858
2859
// If the address instruction is folded into the base register, then the
2860
// addressing mode must not have a scale. Then we can swap the base and the
2861
// scaled registers.
2862
if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2863
return false;
2864
2865
switch (AddrI.getOpcode()) {
2866
default:
2867
return false;
2868
2869
case AArch64::SBFMXri:
2870
// sxtw Xa, Wm
2871
// ldr Xd, [Xn, Xa, lsl #N]
2872
// ->
2873
// ldr Xd, [Xn, Wm, sxtw #N]
2874
if (AddrI.getOperand(2).getImm() != 0 ||
2875
AddrI.getOperand(3).getImm() != 31)
2876
return false;
2877
2878
AM.BaseReg = MemI.getOperand(1).getReg();
2879
if (AM.BaseReg == Reg)
2880
AM.BaseReg = MemI.getOperand(2).getReg();
2881
AM.ScaledReg = AddrI.getOperand(1).getReg();
2882
AM.Scale = OffsetScale;
2883
AM.Displacement = 0;
2884
AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2885
return true;
2886
2887
case TargetOpcode::SUBREG_TO_REG: {
2888
// mov Wa, Wm
2889
// ldr Xd, [Xn, Xa, lsl #N]
2890
// ->
2891
// ldr Xd, [Xn, Wm, uxtw #N]
2892
2893
// Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2894
if (AddrI.getOperand(1).getImm() != 0 ||
2895
AddrI.getOperand(3).getImm() != AArch64::sub_32)
2896
return false;
2897
2898
const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2899
Register OffsetReg = AddrI.getOperand(2).getReg();
2900
if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2901
return false;
2902
2903
const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2904
if (DefMI.getOpcode() != AArch64::ORRWrs ||
2905
DefMI.getOperand(1).getReg() != AArch64::WZR ||
2906
DefMI.getOperand(3).getImm() != 0)
2907
return false;
2908
2909
AM.BaseReg = MemI.getOperand(1).getReg();
2910
if (AM.BaseReg == Reg)
2911
AM.BaseReg = MemI.getOperand(2).getReg();
2912
AM.ScaledReg = DefMI.getOperand(2).getReg();
2913
AM.Scale = OffsetScale;
2914
AM.Displacement = 0;
2915
AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2916
return true;
2917
}
2918
}
2919
}
2920
2921
// Handle memory instructions with a [Reg, #Imm] addressing mode.
2922
2923
// Check we are not breaking a potential conversion to an LDP.
2924
auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2925
int64_t NewOffset) -> bool {
2926
int64_t MinOffset, MaxOffset;
2927
switch (NumBytes) {
2928
default:
2929
return true;
2930
case 4:
2931
MinOffset = -256;
2932
MaxOffset = 252;
2933
break;
2934
case 8:
2935
MinOffset = -512;
2936
MaxOffset = 504;
2937
break;
2938
case 16:
2939
MinOffset = -1024;
2940
MaxOffset = 1008;
2941
break;
2942
}
2943
return OldOffset < MinOffset || OldOffset > MaxOffset ||
2944
(NewOffset >= MinOffset && NewOffset <= MaxOffset);
2945
};
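// The bounds above come from LDP's signed, scaled 7-bit immediate (comment
// added for clarity, not in the original source): the byte offset must be
// size * [-64, 63], e.g. 8-byte accesses allow -512 .. 504 and 16-byte
// accesses -1024 .. 1008.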
2946
auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2947
int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2948
int64_t NewOffset = OldOffset + Disp;
2949
if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2950
return false;
2951
// If the old offset would fit into an LDP, but the new offset wouldn't,
2952
// bail out.
2953
if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2954
return false;
2955
AM.BaseReg = AddrI.getOperand(1).getReg();
2956
AM.ScaledReg = 0;
2957
AM.Scale = 0;
2958
AM.Displacement = NewOffset;
2959
AM.Form = ExtAddrMode::Formula::Basic;
2960
return true;
2961
};
2962
2963
auto canFoldAddRegIntoAddrMode =
2964
[&](int64_t Scale,
2965
ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2966
if (MemI.getOperand(2).getImm() != 0)
2967
return false;
2968
if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2969
return false;
2970
AM.BaseReg = AddrI.getOperand(1).getReg();
2971
AM.ScaledReg = AddrI.getOperand(2).getReg();
2972
AM.Scale = Scale;
2973
AM.Displacement = 0;
2974
AM.Form = Form;
2975
return true;
2976
};
2977
2978
auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2979
unsigned Opcode = MemI.getOpcode();
2980
return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2981
Subtarget.isSTRQroSlow();
2982
};
2983
2984
int64_t Disp = 0;
2985
const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2986
switch (AddrI.getOpcode()) {
2987
default:
2988
return false;
2989
2990
case AArch64::ADDXri:
2991
// add Xa, Xn, #N
2992
// ldr Xd, [Xa, #M]
2993
// ->
2994
// ldr Xd, [Xn, #N'+M]
2995
Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2996
return canFoldAddSubImmIntoAddrMode(Disp);
2997
2998
case AArch64::SUBXri:
2999
// sub Xa, Xn, #N
3000
// ldr Xd, [Xa, #M]
3001
// ->
3002
// ldr Xd, [Xn, #N'+M]
3003
Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3004
return canFoldAddSubImmIntoAddrMode(-Disp);
3005
3006
case AArch64::ADDXrs: {
3007
// add Xa, Xn, Xm, lsl #N
3008
// ldr Xd, [Xa]
3009
// ->
3010
// ldr Xd, [Xn, Xm, lsl #N]
3011
3012
// Don't fold the add if the result would be slower, unless optimising for
3013
// size.
3014
unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3015
if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3016
return false;
3017
Shift = AArch64_AM::getShiftValue(Shift);
3018
if (!OptSize) {
3019
if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3020
return false;
3021
if (avoidSlowSTRQ(MemI))
3022
return false;
3023
}
3024
return canFoldAddRegIntoAddrMode(1ULL << Shift);
3025
}
3026
3027
case AArch64::ADDXrr:
3028
// add Xa, Xn, Xm
3029
// ldr Xd, [Xa]
3030
// ->
3031
// ldr Xd, [Xn, Xm, lsl #0]
3032
3033
// Don't fold the add if the result would be slower, unless optimising for
3034
// size.
3035
if (!OptSize && avoidSlowSTRQ(MemI))
3036
return false;
3037
return canFoldAddRegIntoAddrMode(1);
3038
3039
case AArch64::ADDXrx:
3040
// add Xa, Xn, Wm, {s,u}xtw #N
3041
// ldr Xd, [Xa]
3042
// ->
3043
// ldr Xd, [Xn, Wm, {s,u}xtw #N]
3044
3045
// Don't fold the add if the result would be slower, unless optimising for
3046
// size.
3047
if (!OptSize && avoidSlowSTRQ(MemI))
3048
return false;
3049
3050
// Can fold only sign-/zero-extend of a word.
3051
unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3052
AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3053
if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3054
return false;
3055
3056
return canFoldAddRegIntoAddrMode(
3057
1ULL << AArch64_AM::getArithShiftValue(Imm),
3058
(Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3059
: ExtAddrMode::Formula::ZExtScaledReg);
3060
}
3061
}
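// Worked example (not from the original source; registers arbitrary): for
//   add x8, x0, #16
//   ldr x9, [x8, #8]
// folding the ADDXri yields BaseReg = x0, Displacement = 24, and
// emitLdStWithAddr below rewrites the access as a single load from [x0, #24]
// (in the unscaled LDUR form, since 24 fits in a signed 9-bit offset).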
3062
3063
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3064
// return the opcode of an instruction performing the same operation, but using
3065
// the [Reg, Reg] addressing mode.
3066
static unsigned regOffsetOpcode(unsigned Opcode) {
3067
switch (Opcode) {
3068
default:
3069
llvm_unreachable("Address folding not implemented for instruction");
3070
3071
case AArch64::LDURQi:
3072
case AArch64::LDRQui:
3073
return AArch64::LDRQroX;
3074
case AArch64::STURQi:
3075
case AArch64::STRQui:
3076
return AArch64::STRQroX;
3077
case AArch64::LDURDi:
3078
case AArch64::LDRDui:
3079
return AArch64::LDRDroX;
3080
case AArch64::STURDi:
3081
case AArch64::STRDui:
3082
return AArch64::STRDroX;
3083
case AArch64::LDURXi:
3084
case AArch64::LDRXui:
3085
return AArch64::LDRXroX;
3086
case AArch64::STURXi:
3087
case AArch64::STRXui:
3088
return AArch64::STRXroX;
3089
case AArch64::LDURWi:
3090
case AArch64::LDRWui:
3091
return AArch64::LDRWroX;
3092
case AArch64::LDURSWi:
3093
case AArch64::LDRSWui:
3094
return AArch64::LDRSWroX;
3095
case AArch64::STURWi:
3096
case AArch64::STRWui:
3097
return AArch64::STRWroX;
3098
case AArch64::LDURHi:
3099
case AArch64::LDRHui:
3100
return AArch64::LDRHroX;
3101
case AArch64::STURHi:
3102
case AArch64::STRHui:
3103
return AArch64::STRHroX;
3104
case AArch64::LDURHHi:
3105
case AArch64::LDRHHui:
3106
return AArch64::LDRHHroX;
3107
case AArch64::STURHHi:
3108
case AArch64::STRHHui:
3109
return AArch64::STRHHroX;
3110
case AArch64::LDURSHXi:
3111
case AArch64::LDRSHXui:
3112
return AArch64::LDRSHXroX;
3113
case AArch64::LDURSHWi:
3114
case AArch64::LDRSHWui:
3115
return AArch64::LDRSHWroX;
3116
case AArch64::LDURBi:
3117
case AArch64::LDRBui:
3118
return AArch64::LDRBroX;
3119
case AArch64::LDURBBi:
3120
case AArch64::LDRBBui:
3121
return AArch64::LDRBBroX;
3122
case AArch64::LDURSBXi:
3123
case AArch64::LDRSBXui:
3124
return AArch64::LDRSBXroX;
3125
case AArch64::LDURSBWi:
3126
case AArch64::LDRSBWui:
3127
return AArch64::LDRSBWroX;
3128
case AArch64::STURBi:
3129
case AArch64::STRBui:
3130
return AArch64::STRBroX;
3131
case AArch64::STURBBi:
3132
case AArch64::STRBBui:
3133
return AArch64::STRBBroX;
3134
}
3135
}
3136
3137
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3138
// the opcode of an instruction performing the same operation, but using the
3139
// [Reg, #Imm] addressing mode with scaled offset.
3140
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3141
switch (Opcode) {
3142
default:
3143
llvm_unreachable("Address folding not implemented for instruction");
3144
3145
case AArch64::LDURQi:
3146
Scale = 16;
3147
return AArch64::LDRQui;
3148
case AArch64::STURQi:
3149
Scale = 16;
3150
return AArch64::STRQui;
3151
case AArch64::LDURDi:
3152
Scale = 8;
3153
return AArch64::LDRDui;
3154
case AArch64::STURDi:
3155
Scale = 8;
3156
return AArch64::STRDui;
3157
case AArch64::LDURXi:
3158
Scale = 8;
3159
return AArch64::LDRXui;
3160
case AArch64::STURXi:
3161
Scale = 8;
3162
return AArch64::STRXui;
3163
case AArch64::LDURWi:
3164
Scale = 4;
3165
return AArch64::LDRWui;
3166
case AArch64::LDURSWi:
3167
Scale = 4;
3168
return AArch64::LDRSWui;
3169
case AArch64::STURWi:
3170
Scale = 4;
3171
return AArch64::STRWui;
3172
case AArch64::LDURHi:
3173
Scale = 2;
3174
return AArch64::LDRHui;
3175
case AArch64::STURHi:
3176
Scale = 2;
3177
return AArch64::STRHui;
3178
case AArch64::LDURHHi:
3179
Scale = 2;
3180
return AArch64::LDRHHui;
3181
case AArch64::STURHHi:
3182
Scale = 2;
3183
return AArch64::STRHHui;
3184
case AArch64::LDURSHXi:
3185
Scale = 2;
3186
return AArch64::LDRSHXui;
3187
case AArch64::LDURSHWi:
3188
Scale = 2;
3189
return AArch64::LDRSHWui;
3190
case AArch64::LDURBi:
3191
Scale = 1;
3192
return AArch64::LDRBui;
3193
case AArch64::LDURBBi:
3194
Scale = 1;
3195
return AArch64::LDRBBui;
3196
case AArch64::LDURSBXi:
3197
Scale = 1;
3198
return AArch64::LDRSBXui;
3199
case AArch64::LDURSBWi:
3200
Scale = 1;
3201
return AArch64::LDRSBWui;
3202
case AArch64::STURBi:
3203
Scale = 1;
3204
return AArch64::STRBui;
3205
case AArch64::STURBBi:
3206
Scale = 1;
3207
return AArch64::STRBBui;
3208
case AArch64::LDRQui:
3209
case AArch64::STRQui:
3210
Scale = 16;
3211
return Opcode;
3212
case AArch64::LDRDui:
3213
case AArch64::STRDui:
3214
case AArch64::LDRXui:
3215
case AArch64::STRXui:
3216
Scale = 8;
3217
return Opcode;
3218
case AArch64::LDRWui:
3219
case AArch64::LDRSWui:
3220
case AArch64::STRWui:
3221
Scale = 4;
3222
return Opcode;
3223
case AArch64::LDRHui:
3224
case AArch64::STRHui:
3225
case AArch64::LDRHHui:
3226
case AArch64::STRHHui:
3227
case AArch64::LDRSHXui:
3228
case AArch64::LDRSHWui:
3229
Scale = 2;
3230
return Opcode;
3231
case AArch64::LDRBui:
3232
case AArch64::LDRBBui:
3233
case AArch64::LDRSBXui:
3234
case AArch64::LDRSBWui:
3235
case AArch64::STRBui:
3236
case AArch64::STRBBui:
3237
Scale = 1;
3238
return Opcode;
3239
}
3240
}
3241
3242
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3243
// the opcode of an instruction performing the same operation, but using the
3244
// [Reg, #Imm] addressing mode with unscaled offset.
3245
unsigned unscaledOffsetOpcode(unsigned Opcode) {
3246
switch (Opcode) {
3247
default:
3248
llvm_unreachable("Address folding not implemented for instruction");
3249
3250
case AArch64::LDURQi:
3251
case AArch64::STURQi:
3252
case AArch64::LDURDi:
3253
case AArch64::STURDi:
3254
case AArch64::LDURXi:
3255
case AArch64::STURXi:
3256
case AArch64::LDURWi:
3257
case AArch64::LDURSWi:
3258
case AArch64::STURWi:
3259
case AArch64::LDURHi:
3260
case AArch64::STURHi:
3261
case AArch64::LDURHHi:
3262
case AArch64::STURHHi:
3263
case AArch64::LDURSHXi:
3264
case AArch64::LDURSHWi:
3265
case AArch64::LDURBi:
3266
case AArch64::STURBi:
3267
case AArch64::LDURBBi:
3268
case AArch64::STURBBi:
3269
case AArch64::LDURSBWi:
3270
case AArch64::LDURSBXi:
3271
return Opcode;
3272
case AArch64::LDRQui:
3273
return AArch64::LDURQi;
3274
case AArch64::STRQui:
3275
return AArch64::STURQi;
3276
case AArch64::LDRDui:
3277
return AArch64::LDURDi;
3278
case AArch64::STRDui:
3279
return AArch64::STURDi;
3280
case AArch64::LDRXui:
3281
return AArch64::LDURXi;
3282
case AArch64::STRXui:
3283
return AArch64::STURXi;
3284
case AArch64::LDRWui:
3285
return AArch64::LDURWi;
3286
case AArch64::LDRSWui:
3287
return AArch64::LDURSWi;
3288
case AArch64::STRWui:
3289
return AArch64::STURWi;
3290
case AArch64::LDRHui:
3291
return AArch64::LDURHi;
3292
case AArch64::STRHui:
3293
return AArch64::STURHi;
3294
case AArch64::LDRHHui:
3295
return AArch64::LDURHHi;
3296
case AArch64::STRHHui:
3297
return AArch64::STURHHi;
3298
case AArch64::LDRSHXui:
3299
return AArch64::LDURSHXi;
3300
case AArch64::LDRSHWui:
3301
return AArch64::LDURSHWi;
3302
case AArch64::LDRBBui:
3303
return AArch64::LDURBBi;
3304
case AArch64::LDRBui:
3305
return AArch64::LDURBi;
3306
case AArch64::STRBBui:
3307
return AArch64::STURBBi;
3308
case AArch64::STRBui:
3309
return AArch64::STURBi;
3310
case AArch64::LDRSBWui:
3311
return AArch64::LDURSBWi;
3312
case AArch64::LDRSBXui:
3313
return AArch64::LDURSBXi;
3314
}
3315
}
3316
3317
// Given the opcode of a memory load/store instruction, return the opcode of an
3318
// instruction performing the same operation, but using
3319
// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3320
// offset register.
3321
static unsigned offsetExtendOpcode(unsigned Opcode) {
3322
switch (Opcode) {
3323
default:
3324
llvm_unreachable("Address folding not implemented for instruction");
3325
3326
case AArch64::LDRQroX:
3327
case AArch64::LDURQi:
3328
case AArch64::LDRQui:
3329
return AArch64::LDRQroW;
3330
case AArch64::STRQroX:
3331
case AArch64::STURQi:
3332
case AArch64::STRQui:
3333
return AArch64::STRQroW;
3334
case AArch64::LDRDroX:
3335
case AArch64::LDURDi:
3336
case AArch64::LDRDui:
3337
return AArch64::LDRDroW;
3338
case AArch64::STRDroX:
3339
case AArch64::STURDi:
3340
case AArch64::STRDui:
3341
return AArch64::STRDroW;
3342
case AArch64::LDRXroX:
3343
case AArch64::LDURXi:
3344
case AArch64::LDRXui:
3345
return AArch64::LDRXroW;
3346
case AArch64::STRXroX:
3347
case AArch64::STURXi:
3348
case AArch64::STRXui:
3349
return AArch64::STRXroW;
3350
case AArch64::LDRWroX:
3351
case AArch64::LDURWi:
3352
case AArch64::LDRWui:
3353
return AArch64::LDRWroW;
3354
case AArch64::LDRSWroX:
3355
case AArch64::LDURSWi:
3356
case AArch64::LDRSWui:
3357
return AArch64::LDRSWroW;
3358
case AArch64::STRWroX:
3359
case AArch64::STURWi:
3360
case AArch64::STRWui:
3361
return AArch64::STRWroW;
3362
case AArch64::LDRHroX:
3363
case AArch64::LDURHi:
3364
case AArch64::LDRHui:
3365
return AArch64::LDRHroW;
3366
case AArch64::STRHroX:
3367
case AArch64::STURHi:
3368
case AArch64::STRHui:
3369
return AArch64::STRHroW;
3370
case AArch64::LDRHHroX:
3371
case AArch64::LDURHHi:
3372
case AArch64::LDRHHui:
3373
return AArch64::LDRHHroW;
3374
case AArch64::STRHHroX:
3375
case AArch64::STURHHi:
3376
case AArch64::STRHHui:
3377
return AArch64::STRHHroW;
3378
case AArch64::LDRSHXroX:
3379
case AArch64::LDURSHXi:
3380
case AArch64::LDRSHXui:
3381
return AArch64::LDRSHXroW;
3382
case AArch64::LDRSHWroX:
3383
case AArch64::LDURSHWi:
3384
case AArch64::LDRSHWui:
3385
return AArch64::LDRSHWroW;
3386
case AArch64::LDRBroX:
3387
case AArch64::LDURBi:
3388
case AArch64::LDRBui:
3389
return AArch64::LDRBroW;
3390
case AArch64::LDRBBroX:
3391
case AArch64::LDURBBi:
3392
case AArch64::LDRBBui:
3393
return AArch64::LDRBBroW;
3394
case AArch64::LDRSBXroX:
3395
case AArch64::LDURSBXi:
3396
case AArch64::LDRSBXui:
3397
return AArch64::LDRSBXroW;
3398
case AArch64::LDRSBWroX:
3399
case AArch64::LDURSBWi:
3400
case AArch64::LDRSBWui:
3401
return AArch64::LDRSBWroW;
3402
case AArch64::STRBroX:
3403
case AArch64::STURBi:
3404
case AArch64::STRBui:
3405
return AArch64::STRBroW;
3406
case AArch64::STRBBroX:
3407
case AArch64::STURBBi:
3408
case AArch64::STRBBui:
3409
return AArch64::STRBBroW;
3410
}
3411
}

MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
                                                 const ExtAddrMode &AM) const {

  const DebugLoc &DL = MemI.getDebugLoc();
  MachineBasicBlock &MBB = *MemI.getParent();
  MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();

  if (AM.Form == ExtAddrMode::Formula::Basic) {
    if (AM.ScaledReg) {
      // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
      unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
      MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
      auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                   .addReg(MemI.getOperand(0).getReg(),
                           MemI.mayLoad() ? RegState::Define : 0)
                   .addReg(AM.BaseReg)
                   .addReg(AM.ScaledReg)
                   .addImm(0)
                   .addImm(AM.Scale > 1)
                   .setMemRefs(MemI.memoperands())
                   .setMIFlags(MemI.getFlags());
      return B.getInstr();
    }

    assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
           "Addressing mode not supported for folding");

    // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
    unsigned Scale = 1;
    unsigned Opcode = MemI.getOpcode();
    if (isInt<9>(AM.Displacement))
      Opcode = unscaledOffsetOpcode(Opcode);
    else
      Opcode = scaledOffsetOpcode(Opcode, Scale);

    auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                 .addReg(MemI.getOperand(0).getReg(),
                         MemI.mayLoad() ? RegState::Define : 0)
                 .addReg(AM.BaseReg)
                 .addImm(AM.Displacement / Scale)
                 .setMemRefs(MemI.memoperands())
                 .setMIFlags(MemI.getFlags());
    return B.getInstr();
  }

  if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
      AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
    // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
    assert(AM.ScaledReg && !AM.Displacement &&
           "Address offset can be a register or an immediate, but not both");
    unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
    MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
    // Make sure the offset register is in the correct register class.
    Register OffsetReg = AM.ScaledReg;
    const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
    if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
      OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
      BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
          .addReg(AM.ScaledReg, 0, AArch64::sub_32);
    }
    auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                 .addReg(MemI.getOperand(0).getReg(),
                         MemI.mayLoad() ? RegState::Define : 0)
                 .addReg(AM.BaseReg)
                 .addReg(OffsetReg)
                 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
                 .addImm(AM.Scale != 1)
                 .setMemRefs(MemI.memoperands())
                 .setMIFlags(MemI.getFlags());

    return B.getInstr();
  }

  llvm_unreachable(
      "Function must not be called with an addressing mode it can't handle");
}

bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
    const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, TypeSize &Width,
    const TargetRegisterInfo *TRI) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // Handle only loads/stores with base register followed by immediate offset.
  if (LdSt.getNumExplicitOperands() == 3) {
    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
    if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
        !LdSt.getOperand(2).isImm())
      return false;
  } else if (LdSt.getNumExplicitOperands() == 4) {
    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
    if (!LdSt.getOperand(1).isReg() ||
        (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
        !LdSt.getOperand(3).isImm())
      return false;
  } else
    return false;

  // Get the scaling factor for the instruction and set the width for the
  // instruction.
  TypeSize Scale(0U, false);
  int64_t Dummy1, Dummy2;

  // If this returns false, then it's an instruction we don't want to handle.
  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
    return false;

  // Compute the offset. Offset is calculated as the immediate operand
  // multiplied by the scaling factor. Unscaled instructions have scaling factor
  // set to 1.
  if (LdSt.getNumExplicitOperands() == 3) {
    BaseOp = &LdSt.getOperand(1);
    Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
  } else {
    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
    BaseOp = &LdSt.getOperand(2);
    Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
  }
  OffsetIsScalable = Scale.isScalable();

  if (!BaseOp->isReg() && !BaseOp->isFI())
    return false;

  return true;
}

MachineOperand &
AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
  return OfsOp;
}

bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
                                    TypeSize &Width, int64_t &MinOffset,
                                    int64_t &MaxOffset) {
  switch (Opcode) {
  // Not a memory operation or something we want to handle.
  default:
    Scale = TypeSize::getFixed(0);
    Width = TypeSize::getFixed(0);
    MinOffset = MaxOffset = 0;
    return false;
  // LDR / STR
  case AArch64::LDRQui:
  case AArch64::STRQui:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRXui:
  case AArch64::LDRDui:
  case AArch64::STRXui:
  case AArch64::STRDui:
  case AArch64::PRFMui:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRWui:
  case AArch64::LDRSui:
  case AArch64::LDRSWui:
  case AArch64::STRWui:
  case AArch64::STRSui:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:
  case AArch64::STRHui:
  case AArch64::STRHHui:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  // post/pre inc
  case AArch64::STRQpre:
  case AArch64::LDRQpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRXpre:
  case AArch64::STRDpre:
  case AArch64::LDRXpost:
  case AArch64::LDRDpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRWpost:
  case AArch64::LDRWpost:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // Unscaled
  case AArch64::LDURQi:
  case AArch64::STURQi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURXi:
  case AArch64::LDURDi:
  case AArch64::LDAPURXi:
  case AArch64::STURXi:
  case AArch64::STURDi:
  case AArch64::STLURXi:
  case AArch64::PRFUMi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURWi:
  case AArch64::LDURSi:
  case AArch64::LDURSWi:
  case AArch64::LDAPURi:
  case AArch64::LDAPURSWi:
  case AArch64::STURWi:
  case AArch64::STURSi:
  case AArch64::STLURWi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURHi:
  case AArch64::LDURHHi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSHWi:
  case AArch64::LDAPURHi:
  case AArch64::LDAPURSHWi:
  case AArch64::LDAPURSHXi:
  case AArch64::STURHi:
  case AArch64::STURHHi:
  case AArch64::STLURHi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURBi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSBWi:
  case AArch64::LDAPURBi:
  case AArch64::LDAPURSBWi:
  case AArch64::LDAPURSBXi:
  case AArch64::STURBi:
  case AArch64::STURBBi:
  case AArch64::STLURBi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // LDP / STP
  case AArch64::LDPQi:
  case AArch64::LDNPQi:
  case AArch64::STPQi:
  case AArch64::STNPQi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPXi:
  case AArch64::LDPDi:
  case AArch64::LDNPXi:
  case AArch64::LDNPDi:
  case AArch64::STPXi:
  case AArch64::STPDi:
  case AArch64::STNPXi:
  case AArch64::STNPDi:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPWi:
  case AArch64::LDPSi:
  case AArch64::LDNPWi:
  case AArch64::LDNPSi:
  case AArch64::STPWi:
  case AArch64::STPSi:
  case AArch64::STNPWi:
  case AArch64::STNPSi:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(8);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  // pre/post inc
  case AArch64::STPQpre:
  case AArch64::LDPQpost:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -1024;
    MaxOffset = 1008;
    break;
  case AArch64::STPXpre:
  case AArch64::LDPXpost:
  case AArch64::STPDpre:
  case AArch64::LDPDpost:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = -512;
    MaxOffset = 504;
    break;
  case AArch64::StoreSwiftAsyncContext:
    // Store is an STRXui, but there might be an ADDXri in the expansion too.
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::ADDG:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::TAGPstack:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    // TAGP with a negative offset turns into SUBP, which has a maximum offset
    // of 63 (not 64!).
    MinOffset = -63;
    MaxOffset = 63;
    break;
  case AArch64::LDG:
  case AArch64::STGi:
  case AArch64::STZGi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // SVE
  case AArch64::STR_ZZZZXI:
  case AArch64::LDR_ZZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -256;
    MaxOffset = 252;
    break;
  case AArch64::STR_ZZZXI:
  case AArch64::LDR_ZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -256;
    MaxOffset = 253;
    break;
  case AArch64::STR_ZZXI:
  case AArch64::LDR_ZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_PXI:
  case AArch64::STR_PXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDR_PPXI:
  case AArch64::STR_PPXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LD1B_IMM:
  case AArch64::LD1H_IMM:
  case AArch64::LD1W_IMM:
  case AArch64::LD1D_IMM:
  case AArch64::LDNT1B_ZRI:
  case AArch64::LDNT1H_ZRI:
  case AArch64::LDNT1W_ZRI:
  case AArch64::LDNT1D_ZRI:
  case AArch64::ST1B_IMM:
  case AArch64::ST1H_IMM:
  case AArch64::ST1W_IMM:
  case AArch64::ST1D_IMM:
  case AArch64::STNT1B_ZRI:
  case AArch64::STNT1H_ZRI:
  case AArch64::STNT1W_ZRI:
  case AArch64::STNT1D_ZRI:
  case AArch64::LDNF1B_IMM:
  case AArch64::LDNF1H_IMM:
  case AArch64::LDNF1W_IMM:
  case AArch64::LDNF1D_IMM:
    // A full vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD2B_IMM:
  case AArch64::LD2H_IMM:
  case AArch64::LD2W_IMM:
  case AArch64::LD2D_IMM:
  case AArch64::ST2B_IMM:
  case AArch64::ST2H_IMM:
  case AArch64::ST2W_IMM:
  case AArch64::ST2D_IMM:
    Scale = TypeSize::getScalable(32);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD3B_IMM:
  case AArch64::LD3H_IMM:
  case AArch64::LD3W_IMM:
  case AArch64::LD3D_IMM:
  case AArch64::ST3B_IMM:
  case AArch64::ST3H_IMM:
  case AArch64::ST3W_IMM:
  case AArch64::ST3D_IMM:
    Scale = TypeSize::getScalable(48);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD4B_IMM:
  case AArch64::LD4H_IMM:
  case AArch64::LD4W_IMM:
  case AArch64::LD4D_IMM:
  case AArch64::ST4B_IMM:
  case AArch64::ST4H_IMM:
  case AArch64::ST4W_IMM:
  case AArch64::ST4D_IMM:
    Scale = TypeSize::getScalable(64);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_H_IMM:
  case AArch64::LD1SB_H_IMM:
  case AArch64::LD1H_S_IMM:
  case AArch64::LD1SH_S_IMM:
  case AArch64::LD1W_D_IMM:
  case AArch64::LD1SW_D_IMM:
  case AArch64::ST1B_H_IMM:
  case AArch64::ST1H_S_IMM:
  case AArch64::ST1W_D_IMM:
  case AArch64::LDNF1B_H_IMM:
  case AArch64::LDNF1SB_H_IMM:
  case AArch64::LDNF1H_S_IMM:
  case AArch64::LDNF1SH_S_IMM:
  case AArch64::LDNF1W_D_IMM:
  case AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(8);
    Width = TypeSize::getScalable(8);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(4);
    Width = TypeSize::getScalable(4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // An eighth vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}
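
// Editorial illustration (not part of the upstream source): the limits above
// are expressed in terms of the unscaled immediate encoded in the instruction.
// For example LDRXui has Scale = 8 and MinOffset/MaxOffset = 0/4095, so it can
// address byte offsets 0, 8, ..., 32760, while its unscaled twin LDURXi has
// Scale = 1 and covers byte offsets -256..255 at any alignment.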

// Scaling factor for unscaled load or store.
int AArch64InstrInfo::getMemScale(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRSWpre:
  case AArch64::LDRWpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::LDPQi:
  case AArch64::LDRQpre:
  case AArch64::STPQi:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
    return 16;
  }
}

bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
  return isPreLd(MI) || isPreSt(MI);
}

bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}

const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
                                                                            : 1;
  return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
                                                                            : 2;
  return MI.getOperand(Idx);
}

static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                              Register Reg) {
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}

bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
  auto IsHFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR16RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR16_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsHFPR);
}

bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
  auto IsQFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR128RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsQFPR);
}

bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::BRK:
  case AArch64::HLT:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
    // Implicit BTI behavior.
    return true;
  case AArch64::PAUTH_PROLOGUE:
    // PAUTH_PROLOGUE expands to PACI(A|B)SP.
    return true;
  case AArch64::HINT: {
    unsigned Imm = MI.getOperand(0).getImm();
    // Explicit BTI instruction.
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return true;
    // PACI(A|B)SP instructions.
    if (Imm == 25 || Imm == 27)
      return true;
    return false;
  }
  default:
    return false;
  }
}

bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
  if (Reg == 0)
    return false;
  assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
  return AArch64::FPR128RegClass.contains(Reg) ||
         AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR32RegClass.contains(Reg) ||
         AArch64::FPR16RegClass.contains(Reg) ||
         AArch64::FPR8RegClass.contains(Reg);
}

bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
  auto IsFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return isFpOrNEON(Reg);

    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass ||
           TRC == &AArch64::FPR64RegClass ||
           TRC == &AArch64::FPR64_loRegClass ||
           TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR8RegClass;
  };
  return llvm::any_of(MI.operands(), IsFPR);
}

// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}
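
// Editorial illustration (not part of the upstream source): for an unscaled
// access such as LDURXi the memory scale is 8, so a byte offset of 16 is
// rewritten to the element offset 2, while a byte offset of 12 is rejected
// because it is not a multiple of the scale and so cannot be encoded by the
// scaled pair form.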

static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
  }
  // These instructions can't be paired based on their opcodes.
  return false;
}

static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
    // Convert to scaled object offsets.
    int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;
    int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;
    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}
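
// Editorial illustration (not part of the upstream source): two LDRXui loads
// from fixed stack objects at object offsets 16 and 24, each with an
// instruction offset of 0, scale to 16/8 + 0 = 2 and 24/8 + 0 = 3. The scaled
// offsets are consecutive, so the accesses are candidates for forming an ldp.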

/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  return Offset1 + 1 == Offset2;
}

static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
                                            unsigned Reg, unsigned SubIdx,
                                            unsigned State,
                                            const TargetRegisterInfo *TRI) {
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  if (Register::isPhysicalRegister(Reg))
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
  return MIB.addReg(Reg, State, SubIdx);
}

static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // We really want the positive remainder mod 32 here, that happens to be
  // easily obtainable with a mask.
  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
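
// Editorial illustration (not part of the upstream source): when copying the
// D-register triple D1_D2_D3 into D2_D3_D4, (2 - 1) & 0x1f = 1 < 3, so a
// front-to-back sub-register copy would overwrite D2 and D3 before they are
// read; copyPhysRegTuple below detects this and copies the tuple in reverse.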

void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}

void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       DebugLoc DL, unsigned DestReg,
                                       unsigned SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
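
// Editorial illustration (not part of the upstream source): with Opcode set to
// ORRXrs and ZeroReg to XZR, each iteration above emits the move
// "orr xD, xzr, xS, lsl #0" for one 64-bit half of a sequential register pair.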

void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, MCRegister DestReg,
                                   MCRegister SrcReg, bool KillSrc) const {
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
    const TargetRegisterInfo *TRI = &getRegisterInfo();

    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMove()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      }
    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      if (Subtarget.hasZeroCycleRegMove()) {
        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
            .addReg(AArch64::XZR)
            .addReg(SrcRegX, RegState::Undef)
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        // Otherwise, expand to ORR WZR.
        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
            .addReg(AArch64::WZR)
            .addReg(SrcReg, getKillRegState(KillSrc));
      }
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    if (Subtarget.isSVEorStreamingSVEAvailable() &&
        !Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    else if (Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    else {
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    DestReg =
        RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
    SrcReg =
        RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    DestReg =
        RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
    SrcReg =
        RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  const TargetRegisterInfo &TRI = getRegisterInfo();
  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
         << TRI.getRegAsmName(SrcReg) << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}

static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertBefore,
                                    const MCInstrDesc &MCID,
                                    Register SrcReg, bool IsKill,
                                    unsigned SubIdx0, unsigned SubIdx1, int FI,
                                    MachineMemOperand *MMO) {
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  if (SrcReg.isPhysical()) {
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}

void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
                                           const TargetRegisterInfo *TRI,
                                           Register VReg) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
  unsigned Opc = 0;
  bool Offset = true;
  MCRegister PNRReg = MCRegister::NoRegister;
  unsigned StackID = TargetStackID::Default;
  switch (TRI->getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRBui;
    break;
  case 2: {
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRHui;
    else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
             AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_PXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRWui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
      else
        assert(SrcReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STR_PPXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRXui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      else
        assert(SrcReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPWi), SrcReg, isKill,
                              AArch64::sube32, AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPXi), SrcReg, isKill,
                              AArch64::sube64, AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(SrcReg, getKillRegState(isKill))
                                     .addFrameIndex(FI);

  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}
4974
4975
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4976
MachineBasicBlock &MBB,
4977
MachineBasicBlock::iterator InsertBefore,
4978
const MCInstrDesc &MCID,
4979
Register DestReg, unsigned SubIdx0,
4980
unsigned SubIdx1, int FI,
4981
MachineMemOperand *MMO) {
4982
Register DestReg0 = DestReg;
4983
Register DestReg1 = DestReg;
4984
bool IsUndef = true;
4985
if (DestReg.isPhysical()) {
4986
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4987
SubIdx0 = 0;
4988
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4989
SubIdx1 = 0;
4990
IsUndef = false;
4991
}
4992
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4993
.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4994
.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4995
.addFrameIndex(FI)
4996
.addImm(0)
4997
.addMemOperand(MMO);
4998
}
4999
5000
void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
5001
MachineBasicBlock::iterator MBBI,
5002
Register DestReg, int FI,
5003
const TargetRegisterClass *RC,
5004
const TargetRegisterInfo *TRI,
5005
Register VReg) const {
5006
MachineFunction &MF = *MBB.getParent();
5007
MachineFrameInfo &MFI = MF.getFrameInfo();
5008
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5009
MachineMemOperand *MMO =
5010
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5011
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5012
5013
unsigned Opc = 0;
5014
bool Offset = true;
5015
unsigned StackID = TargetStackID::Default;
5016
Register PNRReg = MCRegister::NoRegister;
5017
switch (TRI->getSpillSize(*RC)) {
5018
case 1:
5019
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5020
Opc = AArch64::LDRBui;
5021
break;
5022
case 2: {
5023
bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5024
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5025
Opc = AArch64::LDRHui;
5026
else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5027
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5028
"Unexpected register load without SVE load instructions");
5029
if (IsPNR)
5030
PNRReg = DestReg;
5031
Opc = AArch64::LDR_PXI;
5032
StackID = TargetStackID::ScalableVector;
5033
}
5034
break;
5035
}
5036
case 4:
5037
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5038
Opc = AArch64::LDRWui;
5039
if (DestReg.isVirtual())
5040
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5041
else
5042
assert(DestReg != AArch64::WSP);
5043
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5044
Opc = AArch64::LDRSui;
5045
else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5046
Opc = AArch64::LDR_PPXI;
5047
StackID = TargetStackID::ScalableVector;
5048
}
5049
break;
5050
case 8:
5051
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5052
Opc = AArch64::LDRXui;
5053
if (DestReg.isVirtual())
5054
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5055
else
5056
assert(DestReg != AArch64::SP);
5057
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5058
Opc = AArch64::LDRDui;
5059
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5060
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5061
get(AArch64::LDPWi), DestReg, AArch64::sube32,
5062
AArch64::subo32, FI, MMO);
5063
return;
5064
}
5065
break;
5066
case 16:
5067
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5068
Opc = AArch64::LDRQui;
5069
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5070
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071
Opc = AArch64::LD1Twov1d;
5072
Offset = false;
5073
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5074
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5075
get(AArch64::LDPXi), DestReg, AArch64::sube64,
5076
AArch64::subo64, FI, MMO);
5077
return;
5078
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5079
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5080
"Unexpected register load without SVE load instructions");
5081
Opc = AArch64::LDR_ZXI;
5082
StackID = TargetStackID::ScalableVector;
5083
}
5084
break;
5085
case 24:
5086
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5087
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5088
Opc = AArch64::LD1Threev1d;
5089
Offset = false;
5090
}
5091
break;
5092
case 32:
5093
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5094
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5095
Opc = AArch64::LD1Fourv1d;
5096
Offset = false;
5097
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5098
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5099
Opc = AArch64::LD1Twov2d;
5100
Offset = false;
5101
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5102
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5103
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104
"Unexpected register load without SVE load instructions");
5105
Opc = AArch64::LDR_ZZXI;
5106
StackID = TargetStackID::ScalableVector;
5107
}
5108
break;
5109
case 48:
5110
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5111
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5112
Opc = AArch64::LD1Threev2d;
5113
Offset = false;
5114
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5115
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5116
"Unexpected register load without SVE load instructions");
5117
Opc = AArch64::LDR_ZZZXI;
5118
StackID = TargetStackID::ScalableVector;
5119
}
5120
break;
5121
case 64:
5122
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5123
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5124
Opc = AArch64::LD1Fourv2d;
5125
Offset = false;
5126
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5127
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5128
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5129
"Unexpected register load without SVE load instructions");
5130
Opc = AArch64::LDR_ZZZZXI;
5131
StackID = TargetStackID::ScalableVector;
5132
}
5133
break;
5134
}
5135
5136
assert(Opc && "Unknown register class");
5137
MFI.setStackID(FI, StackID);
5138
5139
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5140
.addReg(DestReg, getDefRegState(true))
5141
.addFrameIndex(FI);
5142
if (Offset)
5143
MI.addImm(0);
5144
if (PNRReg.isValid() && !PNRReg.isVirtual())
5145
MI.addDef(PNRReg, RegState::Implicit);
5146
MI.addMemOperand(MMO);
5147
}
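
// Illustrative note (not part of the upstream source): for a 16-byte spill
// slot the switch above picks LDRQui for FPR128 and LDR_ZXI (with a
// ScalableVector stack ID) for ZPR, while a 32-byte QQ reload uses
// LD1Twov2d, for which Offset is false and no immediate 0 is appended.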
5148
5149
bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5150
const MachineInstr &UseMI,
5151
const TargetRegisterInfo *TRI) {
5152
return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5153
UseMI.getIterator()),
5154
[TRI](const MachineInstr &I) {
5155
return I.modifiesRegister(AArch64::NZCV, TRI) ||
5156
I.readsRegister(AArch64::NZCV, TRI);
5157
});
5158
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the
  // number of 64-bit granules as opposed to 128-bit vector chunks,
  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}
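
// Illustrative example (not part of the upstream source): a StackOffset of
// 16 fixed bytes plus 34 scalable bytes decomposes into ByteSized = 16 and
// VGSized = 17, i.e. a DWARF offset of 16 + 17 * VG bytes, where VG is the
// runtime number of 64-bit granules in an SVE vector.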

/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}
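
// Illustrative example (not part of the upstream source): 130 scalable bytes
// give NumPredicateVectors = 65, which is larger than 62, so the offset is
// refolded into NumDataVectors = 8 (one ADDVL #8) with NumPredicateVectors = 1
// (one ADDPL #1) left over.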

// Convenience function to create a DWARF expression for
// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}
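
// Illustrative example (not part of the upstream source): NumBytes = 16 and
// NumVGScaledBytes = 8 append
//   DW_OP_consts 16, DW_OP_plus, DW_OP_consts 8, DW_OP_bregx VG 0,
//   DW_OP_mul, DW_OP_plus
// to Expr and " + 16 + 8 * VG" to the comment stream.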
5230
5231
// Creates an MCCFIInstruction:
5232
// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5233
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5234
unsigned Reg,
5235
const StackOffset &Offset) {
5236
int64_t NumBytes, NumVGScaledBytes;
5237
AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5238
NumVGScaledBytes);
5239
std::string CommentBuffer;
5240
llvm::raw_string_ostream Comment(CommentBuffer);
5241
5242
if (Reg == AArch64::SP)
5243
Comment << "sp";
5244
else if (Reg == AArch64::FP)
5245
Comment << "fp";
5246
else
5247
Comment << printReg(Reg, &TRI);
5248
5249
// Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5250
SmallString<64> Expr;
5251
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5252
Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5253
Expr.push_back(0);
5254
appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5255
TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5256
5257
// Wrap this into DW_CFA_def_cfa.
5258
SmallString<64> DefCfaExpr;
5259
DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5260
uint8_t buffer[16];
5261
DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5262
DefCfaExpr.append(Expr.str());
5263
return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5264
Comment.str());
5265
}
5266
5267
MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5268
unsigned FrameReg, unsigned Reg,
5269
const StackOffset &Offset,
5270
bool LastAdjustmentWasScalable) {
5271
if (Offset.getScalable())
5272
return createDefCFAExpression(TRI, Reg, Offset);
5273
5274
if (FrameReg == Reg && !LastAdjustmentWasScalable)
5275
return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5276
5277
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5278
return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5279
}
5280
5281
MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5282
unsigned Reg,
5283
const StackOffset &OffsetFromDefCFA) {
5284
int64_t NumBytes, NumVGScaledBytes;
5285
AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5286
OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5287
5288
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5289
5290
// Non-scalable offsets can use DW_CFA_offset directly.
5291
if (!NumVGScaledBytes)
5292
return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5293
5294
std::string CommentBuffer;
5295
llvm::raw_string_ostream Comment(CommentBuffer);
5296
Comment << printReg(Reg, &TRI) << " @ cfa";
5297
5298
// Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5299
SmallString<64> OffsetExpr;
5300
appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5301
TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5302
5303
// Wrap this into DW_CFA_expression
5304
SmallString<64> CfaExpr;
5305
CfaExpr.push_back(dwarf::DW_CFA_expression);
5306
uint8_t buffer[16];
5307
CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5308
CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5309
CfaExpr.append(OffsetExpr.str());
5310
5311
return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5312
Comment.str());
5313
}
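
// The escape built above encodes a DW_CFA_expression record:
//   DW_CFA_expression, ULEB128(DwarfReg), ULEB128(sizeof OffsetExpr), OffsetExpr
// where OffsetExpr is the "NumBytes + NumVGScaledBytes * VG" sequence produced
// by appendVGScaledOffsetExpr.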
5314
5315
// Helper function to emit a frame offset adjustment from a given
5316
// pointer (SrcReg), stored into DestReg. This function is explicit
5317
// in that it requires the opcode.
5318
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5319
MachineBasicBlock::iterator MBBI,
5320
const DebugLoc &DL, unsigned DestReg,
5321
unsigned SrcReg, int64_t Offset, unsigned Opc,
5322
const TargetInstrInfo *TII,
5323
MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5324
bool *HasWinCFI, bool EmitCFAOffset,
5325
StackOffset CFAOffset, unsigned FrameReg) {
5326
int Sign = 1;
5327
unsigned MaxEncoding, ShiftSize;
5328
switch (Opc) {
5329
case AArch64::ADDXri:
5330
case AArch64::ADDSXri:
5331
case AArch64::SUBXri:
5332
case AArch64::SUBSXri:
5333
MaxEncoding = 0xfff;
5334
ShiftSize = 12;
5335
break;
5336
case AArch64::ADDVL_XXI:
5337
case AArch64::ADDPL_XXI:
5338
case AArch64::ADDSVL_XXI:
5339
case AArch64::ADDSPL_XXI:
5340
MaxEncoding = 31;
5341
ShiftSize = 0;
5342
if (Offset < 0) {
5343
MaxEncoding = 32;
5344
Sign = -1;
5345
Offset = -Offset;
5346
}
5347
break;
5348
default:
5349
llvm_unreachable("Unsupported opcode");
5350
}
5351
5352
// `Offset` can be in bytes or in "scalable bytes".
5353
int VScale = 1;
5354
if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5355
VScale = 16;
5356
else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5357
VScale = 2;
5358
5359
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
5360
// scratch register. If DestReg is a virtual register, use it as the
5361
// scratch register; otherwise, create a new virtual register (to be
5362
// replaced by the scavenger at the end of PEI). That case can be optimized
5363
// slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5364
// register can be loaded with offset%8 and the add/sub can use an extending
5365
// instruction with LSL#3.
5366
// Currently the function handles any offsets but generates a poor sequence
5367
// of code.
5368
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5369
5370
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5371
Register TmpReg = DestReg;
5372
if (TmpReg == AArch64::XZR)
5373
TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5374
&AArch64::GPR64RegClass);
5375
do {
5376
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5377
unsigned LocalShiftSize = 0;
5378
if (ThisVal > MaxEncoding) {
5379
ThisVal = ThisVal >> ShiftSize;
5380
LocalShiftSize = ShiftSize;
5381
}
5382
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5383
"Encoding cannot handle value that big");
5384
5385
Offset -= ThisVal << LocalShiftSize;
5386
if (Offset == 0)
5387
TmpReg = DestReg;
5388
auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5389
.addReg(SrcReg)
5390
.addImm(Sign * (int)ThisVal);
5391
if (ShiftSize)
5392
MBI = MBI.addImm(
5393
AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5394
MBI = MBI.setMIFlag(Flag);
5395
5396
auto Change =
5397
VScale == 1
5398
? StackOffset::getFixed(ThisVal << LocalShiftSize)
5399
: StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5400
if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5401
CFAOffset += Change;
5402
else
5403
CFAOffset -= Change;
5404
if (EmitCFAOffset && DestReg == TmpReg) {
5405
MachineFunction &MF = *MBB.getParent();
5406
const TargetSubtargetInfo &STI = MF.getSubtarget();
5407
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5408
5409
unsigned CFIIndex = MF.addFrameInst(
5410
createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5411
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5412
.addCFIIndex(CFIIndex)
5413
.setMIFlags(Flag);
5414
}
5415
5416
if (NeedsWinCFI) {
5417
assert(Sign == 1 && "SEH directives should always have a positive sign");
5418
int Imm = (int)(ThisVal << LocalShiftSize);
5419
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5420
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5421
if (HasWinCFI)
5422
*HasWinCFI = true;
5423
if (Imm == 0)
5424
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5425
else
5426
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5427
.addImm(Imm)
5428
.setMIFlag(Flag);
5429
assert(Offset == 0 && "Expected remaining offset to be zero to "
5430
"emit a single SEH directive");
5431
} else if (DestReg == AArch64::SP) {
5432
if (HasWinCFI)
5433
*HasWinCFI = true;
5434
assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5435
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5436
.addImm(Imm)
5437
.setMIFlag(Flag);
5438
}
5439
}
5440
5441
SrcReg = TmpReg;
5442
} while (Offset);
5443
}
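
// Worked example (illustrative, not part of the upstream source): with
// Opc = ADDXri and Offset = 0x1010, the loop above emits two instructions,
//   add xd, xn, #1, lsl #12
//   add xd, xd, #16
// because 0x1010 exceeds the 12-bit unshifted immediate range.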
5444
5445
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5446
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5447
unsigned DestReg, unsigned SrcReg,
5448
StackOffset Offset, const TargetInstrInfo *TII,
5449
MachineInstr::MIFlag Flag, bool SetNZCV,
5450
bool NeedsWinCFI, bool *HasWinCFI,
5451
bool EmitCFAOffset, StackOffset CFAOffset,
5452
unsigned FrameReg) {
5453
  // If a function is marked as arm_locally_streaming, then the runtime value of
  // vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5460
5461
int64_t Bytes, NumPredicateVectors, NumDataVectors;
5462
AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5463
Offset, Bytes, NumPredicateVectors, NumDataVectors);
5464
5465
// First emit non-scalable frame offsets, or a simple 'mov'.
5466
if (Bytes || (!Offset && SrcReg != DestReg)) {
5467
assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5468
"SP increment/decrement not 8-byte aligned");
5469
unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5470
if (Bytes < 0) {
5471
Bytes = -Bytes;
5472
Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5473
}
5474
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5475
NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5476
FrameReg);
5477
CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5478
? StackOffset::getFixed(-Bytes)
5479
: StackOffset::getFixed(Bytes);
5480
SrcReg = DestReg;
5481
FrameReg = DestReg;
5482
}
5483
5484
assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5485
"SetNZCV not supported with SVE vectors");
5486
assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5487
"WinCFI not supported with SVE vectors");
5488
5489
if (NumDataVectors) {
5490
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5491
UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5492
TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5493
CFAOffset, FrameReg);
5494
CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5495
SrcReg = DestReg;
5496
}
5497
5498
if (NumPredicateVectors) {
5499
assert(DestReg != AArch64::SP && "Unaligned access to SP");
5500
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5501
UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5502
TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5503
CFAOffset, FrameReg);
5504
}
5505
}
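
// Minimal usage sketch (hypothetical call site, assuming the default trailing
// parameters declared in AArch64InstrInfo.h):
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::get(-64, -32), TII, MachineInstr::FrameSetup);
// This decomposes into a SUB of 64 bytes followed by an ADDVL of -2, since
// -32 scalable bytes are -16 predicate increments, i.e. -2 data vectors.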
5506
5507
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5508
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5509
MachineBasicBlock::iterator InsertPt, int FrameIndex,
5510
LiveIntervals *LIS, VirtRegMap *VRM) const {
5511
// This is a bit of a hack. Consider this instruction:
5512
//
5513
// %0 = COPY %sp; GPR64all:%0
5514
//
5515
// We explicitly chose GPR64all for the virtual register so such a copy might
5516
// be eliminated by RegisterCoalescer. However, that may not be possible, and
5517
// %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5518
// register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5519
//
5520
// To prevent that, we are going to constrain the %0 register class here.
5521
if (MI.isFullCopy()) {
5522
Register DstReg = MI.getOperand(0).getReg();
5523
Register SrcReg = MI.getOperand(1).getReg();
5524
if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5525
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5526
return nullptr;
5527
}
5528
if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5529
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5530
return nullptr;
5531
}
5532
  // Nothing can be folded with copy from/to NZCV.
5533
if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5534
return nullptr;
5535
}
5536
5537
// Handle the case where a copy is being spilled or filled but the source
5538
// and destination register class don't match. For example:
5539
//
5540
// %0 = COPY %xzr; GPR64common:%0
5541
//
5542
// In this case we can still safely fold away the COPY and generate the
5543
// following spill code:
5544
//
5545
// STRXui %xzr, %stack.0
5546
//
5547
// This also eliminates spilled cross register class COPYs (e.g. between x and
5548
// d regs) of the same size. For example:
5549
//
5550
// %0 = COPY %1; GPR64:%0, FPR64:%1
5551
//
5552
// will be filled as
5553
//
5554
// LDRDui %0, fi<#0>
5555
//
5556
// instead of
5557
//
5558
// LDRXui %Temp, fi<#0>
5559
// %0 = FMOV %Temp
5560
//
5561
if (MI.isCopy() && Ops.size() == 1 &&
5562
// Make sure we're only folding the explicit COPY defs/uses.
5563
(Ops[0] == 0 || Ops[0] == 1)) {
5564
bool IsSpill = Ops[0] == 0;
5565
bool IsFill = !IsSpill;
5566
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5567
const MachineRegisterInfo &MRI = MF.getRegInfo();
5568
MachineBasicBlock &MBB = *MI.getParent();
5569
const MachineOperand &DstMO = MI.getOperand(0);
5570
const MachineOperand &SrcMO = MI.getOperand(1);
5571
Register DstReg = DstMO.getReg();
5572
Register SrcReg = SrcMO.getReg();
5573
// This is slightly expensive to compute for physical regs since
5574
// getMinimalPhysRegClass is slow.
5575
auto getRegClass = [&](unsigned Reg) {
5576
return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5577
: TRI.getMinimalPhysRegClass(Reg);
5578
};
5579
5580
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5581
assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5582
TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5583
"Mismatched register size in non subreg COPY");
5584
if (IsSpill)
5585
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5586
getRegClass(SrcReg), &TRI, Register());
5587
else
5588
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5589
getRegClass(DstReg), &TRI, Register());
5590
return &*--InsertPt;
5591
}
5592
5593
// Handle cases like spilling def of:
5594
//
5595
// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5596
//
5597
// where the physical register source can be widened and stored to the full
5598
// virtual reg destination stack slot, in this case producing:
5599
//
5600
// STRXui %xzr, %stack.0
5601
//
5602
if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5603
TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5604
assert(SrcMO.getSubReg() == 0 &&
5605
"Unexpected subreg on physical register");
5606
storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5607
FrameIndex, &AArch64::GPR64RegClass, &TRI,
5608
Register());
5609
return &*--InsertPt;
5610
}
5611
5612
// Handle cases like filling use of:
5613
//
5614
// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5615
//
5616
// where we can load the full virtual reg source stack slot, into the subreg
5617
// destination, in this case producing:
5618
//
5619
// LDRWui %0:sub_32<def,read-undef>, %stack.0
5620
//
5621
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5622
const TargetRegisterClass *FillRC;
5623
switch (DstMO.getSubReg()) {
5624
default:
5625
FillRC = nullptr;
5626
break;
5627
case AArch64::sub_32:
5628
FillRC = &AArch64::GPR32RegClass;
5629
break;
5630
case AArch64::ssub:
5631
FillRC = &AArch64::FPR32RegClass;
5632
break;
5633
case AArch64::dsub:
5634
FillRC = &AArch64::FPR64RegClass;
5635
break;
5636
}
5637
5638
if (FillRC) {
5639
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5640
TRI.getRegSizeInBits(*FillRC) &&
5641
"Mismatched regclass size on folded subreg COPY");
5642
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5643
Register());
5644
MachineInstr &LoadMI = *--InsertPt;
5645
MachineOperand &LoadDst = LoadMI.getOperand(0);
5646
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5647
LoadDst.setSubReg(DstMO.getSubReg());
5648
LoadDst.setIsUndef();
5649
return &LoadMI;
5650
}
5651
}
5652
}
5653
5654
// Cannot fold.
5655
return nullptr;
5656
}
5657
5658
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5659
StackOffset &SOffset,
5660
bool *OutUseUnscaledOp,
5661
unsigned *OutUnscaledOp,
5662
int64_t *EmittableOffset) {
5663
// Set output values in case of early exit.
5664
if (EmittableOffset)
5665
*EmittableOffset = 0;
5666
if (OutUseUnscaledOp)
5667
*OutUseUnscaledOp = false;
5668
if (OutUnscaledOp)
5669
*OutUnscaledOp = 0;
5670
5671
// Exit early for structured vector spills/fills as they can't take an
5672
// immediate offset.
5673
switch (MI.getOpcode()) {
5674
default:
5675
break;
5676
case AArch64::LD1Rv1d:
5677
case AArch64::LD1Rv2s:
5678
case AArch64::LD1Rv2d:
5679
case AArch64::LD1Rv4h:
5680
case AArch64::LD1Rv4s:
5681
case AArch64::LD1Rv8b:
5682
case AArch64::LD1Rv8h:
5683
case AArch64::LD1Rv16b:
5684
case AArch64::LD1Twov2d:
5685
case AArch64::LD1Threev2d:
5686
case AArch64::LD1Fourv2d:
5687
case AArch64::LD1Twov1d:
5688
case AArch64::LD1Threev1d:
5689
case AArch64::LD1Fourv1d:
5690
case AArch64::ST1Twov2d:
5691
case AArch64::ST1Threev2d:
5692
case AArch64::ST1Fourv2d:
5693
case AArch64::ST1Twov1d:
5694
case AArch64::ST1Threev1d:
5695
case AArch64::ST1Fourv1d:
5696
case AArch64::ST1i8:
5697
case AArch64::ST1i16:
5698
case AArch64::ST1i32:
5699
case AArch64::ST1i64:
5700
case AArch64::IRG:
5701
case AArch64::IRGstack:
5702
case AArch64::STGloop:
5703
case AArch64::STZGloop:
5704
return AArch64FrameOffsetCannotUpdate;
5705
}
5706
5707
// Get the min/max offset and the scale.
5708
TypeSize ScaleValue(0U, false), Width(0U, false);
5709
int64_t MinOff, MaxOff;
5710
if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5711
MaxOff))
5712
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5713
5714
// Construct the complete offset.
5715
bool IsMulVL = ScaleValue.isScalable();
5716
unsigned Scale = ScaleValue.getKnownMinValue();
5717
int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5718
5719
const MachineOperand &ImmOpnd =
5720
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5721
Offset += ImmOpnd.getImm() * Scale;
5722
5723
// If the offset doesn't match the scale, we rewrite the instruction to
5724
// use the unscaled instruction instead. Likewise, if we have a negative
5725
// offset and there is an unscaled op to use.
5726
std::optional<unsigned> UnscaledOp =
5727
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5728
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5729
if (useUnscaledOp &&
5730
!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5731
MaxOff))
5732
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5733
5734
Scale = ScaleValue.getKnownMinValue();
5735
assert(IsMulVL == ScaleValue.isScalable() &&
5736
"Unscaled opcode has different value for scalable");
5737
5738
int64_t Remainder = Offset % Scale;
5739
assert(!(Remainder && useUnscaledOp) &&
5740
"Cannot have remainder when using unscaled op");
5741
5742
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5743
int64_t NewOffset = Offset / Scale;
5744
if (MinOff <= NewOffset && NewOffset <= MaxOff)
5745
Offset = Remainder;
5746
else {
5747
NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5748
Offset = Offset - (NewOffset * Scale);
5749
}
5750
5751
if (EmittableOffset)
5752
*EmittableOffset = NewOffset;
5753
if (OutUseUnscaledOp)
5754
*OutUseUnscaledOp = useUnscaledOp;
5755
if (OutUnscaledOp && UnscaledOp)
5756
*OutUnscaledOp = *UnscaledOp;
5757
5758
if (IsMulVL)
5759
SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5760
else
5761
SOffset = StackOffset::get(Offset, SOffset.getScalable());
5762
return AArch64FrameOffsetCanUpdate |
5763
(SOffset ? 0 : AArch64FrameOffsetIsLegal);
5764
}
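
// Worked example (illustrative, not part of the upstream source): for an
// LDRXui with immediate 5 (scale 8) and an incoming fixed SOffset of 4, the
// total offset is 44 bytes. 44 is not a multiple of 8, so the unscaled LDURXi
// form is selected; 44 fits its [-256, 255] range, so EmittableOffset becomes
// 44, SOffset becomes zero, and the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.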
5765
5766
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5767
unsigned FrameReg, StackOffset &Offset,
5768
const AArch64InstrInfo *TII) {
5769
unsigned Opcode = MI.getOpcode();
5770
unsigned ImmIdx = FrameRegIdx + 1;
5771
5772
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5773
Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5774
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5775
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5776
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5777
MI.eraseFromParent();
5778
Offset = StackOffset();
5779
return true;
5780
}
5781
5782
int64_t NewOffset;
5783
unsigned UnscaledOp;
5784
bool UseUnscaledOp;
5785
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5786
&UnscaledOp, &NewOffset);
5787
if (Status & AArch64FrameOffsetCanUpdate) {
5788
if (Status & AArch64FrameOffsetIsLegal)
5789
// Replace the FrameIndex with FrameReg.
5790
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5791
if (UseUnscaledOp)
5792
MI.setDesc(TII->get(UnscaledOp));
5793
5794
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5795
return !Offset;
5796
}
5797
5798
return false;
5799
}
5800
5801
void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5802
MachineBasicBlock::iterator MI) const {
5803
DebugLoc DL;
5804
BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5805
}
5806
5807
MCInst AArch64InstrInfo::getNop() const {
5808
return MCInstBuilder(AArch64::HINT).addImm(0);
5809
}
5810
5811
// AArch64 supports MachineCombiner.
5812
bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5813
5814
// True when Opc sets flag
5815
static bool isCombineInstrSettingFlag(unsigned Opc) {
5816
switch (Opc) {
5817
case AArch64::ADDSWrr:
5818
case AArch64::ADDSWri:
5819
case AArch64::ADDSXrr:
5820
case AArch64::ADDSXri:
5821
case AArch64::SUBSWrr:
5822
case AArch64::SUBSXrr:
5823
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5824
case AArch64::SUBSWri:
5825
case AArch64::SUBSXri:
5826
return true;
5827
default:
5828
break;
5829
}
5830
return false;
5831
}
5832
5833
// 32b Opcodes that can be combined with a MUL
5834
static bool isCombineInstrCandidate32(unsigned Opc) {
5835
switch (Opc) {
5836
case AArch64::ADDWrr:
5837
case AArch64::ADDWri:
5838
case AArch64::SUBWrr:
5839
case AArch64::ADDSWrr:
5840
case AArch64::ADDSWri:
5841
case AArch64::SUBSWrr:
5842
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5843
case AArch64::SUBWri:
5844
case AArch64::SUBSWri:
5845
return true;
5846
default:
5847
break;
5848
}
5849
return false;
5850
}
5851
5852
// 64b Opcodes that can be combined with a MUL
5853
static bool isCombineInstrCandidate64(unsigned Opc) {
5854
switch (Opc) {
5855
case AArch64::ADDXrr:
5856
case AArch64::ADDXri:
5857
case AArch64::SUBXrr:
5858
case AArch64::ADDSXrr:
5859
case AArch64::ADDSXri:
5860
case AArch64::SUBSXrr:
5861
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5862
case AArch64::SUBXri:
5863
case AArch64::SUBSXri:
5864
case AArch64::ADDv8i8:
5865
case AArch64::ADDv16i8:
5866
case AArch64::ADDv4i16:
5867
case AArch64::ADDv8i16:
5868
case AArch64::ADDv2i32:
5869
case AArch64::ADDv4i32:
5870
case AArch64::SUBv8i8:
5871
case AArch64::SUBv16i8:
5872
case AArch64::SUBv4i16:
5873
case AArch64::SUBv8i16:
5874
case AArch64::SUBv2i32:
5875
case AArch64::SUBv4i32:
5876
return true;
5877
default:
5878
break;
5879
}
5880
return false;
5881
}
5882
5883
// FP Opcodes that can be combined with a FMUL.
5884
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5885
switch (Inst.getOpcode()) {
5886
default:
5887
break;
5888
case AArch64::FADDHrr:
5889
case AArch64::FADDSrr:
5890
case AArch64::FADDDrr:
5891
case AArch64::FADDv4f16:
5892
case AArch64::FADDv8f16:
5893
case AArch64::FADDv2f32:
5894
case AArch64::FADDv2f64:
5895
case AArch64::FADDv4f32:
5896
case AArch64::FSUBHrr:
5897
case AArch64::FSUBSrr:
5898
case AArch64::FSUBDrr:
5899
case AArch64::FSUBv4f16:
5900
case AArch64::FSUBv8f16:
5901
case AArch64::FSUBv2f32:
5902
case AArch64::FSUBv2f64:
5903
case AArch64::FSUBv4f32:
5904
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5905
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5906
// the target options or if FADD/FSUB has the contract fast-math flag.
5907
return Options.UnsafeFPMath ||
5908
Options.AllowFPOpFusion == FPOpFusion::Fast ||
5909
Inst.getFlag(MachineInstr::FmContract);
5910
return true;
5911
}
5912
return false;
5913
}
5914
5915
// Opcodes that can be combined with a MUL
5916
static bool isCombineInstrCandidate(unsigned Opc) {
5917
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5918
}
5919
5920
//
5921
// Utility routine that checks if \param MO is defined by an
5922
// \param CombineOpc instruction in the basic block \param MBB
5923
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5924
unsigned CombineOpc, unsigned ZeroReg = 0,
5925
bool CheckZeroReg = false) {
5926
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5927
MachineInstr *MI = nullptr;
5928
5929
if (MO.isReg() && MO.getReg().isVirtual())
5930
MI = MRI.getUniqueVRegDef(MO.getReg());
5931
// And it needs to be in the trace (otherwise, it won't have a depth).
5932
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5933
return false;
5934
  // Must only be used by the user we combine with.
5935
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5936
return false;
5937
5938
if (CheckZeroReg) {
5939
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5940
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5941
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5942
// The third input reg must be zero.
5943
if (MI->getOperand(3).getReg() != ZeroReg)
5944
return false;
5945
}
5946
5947
if (isCombineInstrSettingFlag(CombineOpc) &&
5948
MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
5949
return false;
5950
5951
return true;
5952
}
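
// For the integer case this means (illustrative, not part of the upstream
// source) that %2 = MADDWrrr %a, %b, $wzr qualifies as the multiply feeding
// the combine only if %2 has a single non-debug use and its addend operand
// is WZR, i.e. the MADD is really a plain MUL.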
5953
5954
//
5955
// Is \param MO defined by an integer multiply and can be combined?
5956
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5957
unsigned MulOpc, unsigned ZeroReg) {
5958
return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5959
}
5960
5961
//
5962
// Is \param MO defined by a floating-point multiply and can be combined?
5963
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5964
unsigned MulOpc) {
5965
return canCombine(MBB, MO, MulOpc);
5966
}
5967
5968
// TODO: There are many more machine instruction opcodes to match:
5969
// 1. Other data types (integer, vectors)
5970
// 2. Other math / logic operations (xor, or)
5971
// 3. Other forms of the same operation (intrinsics and other variants)
5972
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5973
bool Invert) const {
5974
if (Invert)
5975
return false;
5976
switch (Inst.getOpcode()) {
5977
// == Floating-point types ==
5978
// -- Floating-point instructions --
5979
case AArch64::FADDHrr:
5980
case AArch64::FADDSrr:
5981
case AArch64::FADDDrr:
5982
case AArch64::FMULHrr:
5983
case AArch64::FMULSrr:
5984
case AArch64::FMULDrr:
5985
case AArch64::FMULX16:
5986
case AArch64::FMULX32:
5987
case AArch64::FMULX64:
5988
// -- Advanced SIMD instructions --
5989
case AArch64::FADDv4f16:
5990
case AArch64::FADDv8f16:
5991
case AArch64::FADDv2f32:
5992
case AArch64::FADDv4f32:
5993
case AArch64::FADDv2f64:
5994
case AArch64::FMULv4f16:
5995
case AArch64::FMULv8f16:
5996
case AArch64::FMULv2f32:
5997
case AArch64::FMULv4f32:
5998
case AArch64::FMULv2f64:
5999
case AArch64::FMULXv4f16:
6000
case AArch64::FMULXv8f16:
6001
case AArch64::FMULXv2f32:
6002
case AArch64::FMULXv4f32:
6003
case AArch64::FMULXv2f64:
6004
// -- SVE instructions --
6005
// Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6006
// in the SVE instruction set (though there are predicated ones).
6007
case AArch64::FADD_ZZZ_H:
6008
case AArch64::FADD_ZZZ_S:
6009
case AArch64::FADD_ZZZ_D:
6010
case AArch64::FMUL_ZZZ_H:
6011
case AArch64::FMUL_ZZZ_S:
6012
case AArch64::FMUL_ZZZ_D:
6013
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6014
(Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6015
Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6016
6017
// == Integer types ==
6018
// -- Base instructions --
6019
// Opcodes MULWrr and MULXrr don't exist because
6020
// `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6021
// `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6022
  // The machine-combiner does not support three-source-operand machine
  // instructions, so we cannot reassociate MULs.
6024
case AArch64::ADDWrr:
6025
case AArch64::ADDXrr:
6026
case AArch64::ANDWrr:
6027
case AArch64::ANDXrr:
6028
case AArch64::ORRWrr:
6029
case AArch64::ORRXrr:
6030
case AArch64::EORWrr:
6031
case AArch64::EORXrr:
6032
case AArch64::EONWrr:
6033
case AArch64::EONXrr:
6034
// -- Advanced SIMD instructions --
6035
// Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6036
// in the Advanced SIMD instruction set.
6037
case AArch64::ADDv8i8:
6038
case AArch64::ADDv16i8:
6039
case AArch64::ADDv4i16:
6040
case AArch64::ADDv8i16:
6041
case AArch64::ADDv2i32:
6042
case AArch64::ADDv4i32:
6043
case AArch64::ADDv1i64:
6044
case AArch64::ADDv2i64:
6045
case AArch64::MULv8i8:
6046
case AArch64::MULv16i8:
6047
case AArch64::MULv4i16:
6048
case AArch64::MULv8i16:
6049
case AArch64::MULv2i32:
6050
case AArch64::MULv4i32:
6051
case AArch64::ANDv8i8:
6052
case AArch64::ANDv16i8:
6053
case AArch64::ORRv8i8:
6054
case AArch64::ORRv16i8:
6055
case AArch64::EORv8i8:
6056
case AArch64::EORv16i8:
6057
// -- SVE instructions --
6058
case AArch64::ADD_ZZZ_B:
6059
case AArch64::ADD_ZZZ_H:
6060
case AArch64::ADD_ZZZ_S:
6061
case AArch64::ADD_ZZZ_D:
6062
case AArch64::MUL_ZZZ_B:
6063
case AArch64::MUL_ZZZ_H:
6064
case AArch64::MUL_ZZZ_S:
6065
case AArch64::MUL_ZZZ_D:
6066
case AArch64::AND_ZZZ:
6067
case AArch64::ORR_ZZZ:
6068
case AArch64::EOR_ZZZ:
6069
return true;
6070
6071
default:
6072
return false;
6073
}
6074
}
6075
6076
/// Find instructions that can be turned into madd.
6077
static bool getMaddPatterns(MachineInstr &Root,
6078
SmallVectorImpl<unsigned> &Patterns) {
6079
unsigned Opc = Root.getOpcode();
6080
MachineBasicBlock &MBB = *Root.getParent();
6081
bool Found = false;
6082
6083
if (!isCombineInstrCandidate(Opc))
6084
return false;
6085
if (isCombineInstrSettingFlag(Opc)) {
6086
int Cmp_NZCV =
6087
Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6088
// When NZCV is live bail out.
6089
if (Cmp_NZCV == -1)
6090
return false;
6091
unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6092
// When opcode can't change bail out.
6093
// CHECKME: do we miss any cases for opcode conversion?
6094
if (NewOpc == Opc)
6095
return false;
6096
Opc = NewOpc;
6097
}
6098
6099
auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6100
unsigned Pattern) {
6101
if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6102
Patterns.push_back(Pattern);
6103
Found = true;
6104
}
6105
};
6106
6107
auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6108
if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6109
Patterns.push_back(Pattern);
6110
Found = true;
6111
}
6112
};
6113
6114
typedef AArch64MachineCombinerPattern MCP;
6115
6116
switch (Opc) {
6117
default:
6118
break;
6119
case AArch64::ADDWrr:
6120
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6121
"ADDWrr does not have register operands");
6122
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6123
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6124
break;
6125
case AArch64::ADDXrr:
6126
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6127
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6128
break;
6129
case AArch64::SUBWrr:
6130
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6131
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6132
break;
6133
case AArch64::SUBXrr:
6134
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6135
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6136
break;
6137
case AArch64::ADDWri:
6138
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6139
break;
6140
case AArch64::ADDXri:
6141
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6142
break;
6143
case AArch64::SUBWri:
6144
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6145
break;
6146
case AArch64::SUBXri:
6147
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6148
break;
6149
case AArch64::ADDv8i8:
6150
setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6151
setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6152
break;
6153
case AArch64::ADDv16i8:
6154
setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6155
setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6156
break;
6157
case AArch64::ADDv4i16:
6158
setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6159
setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6160
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6161
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6162
break;
6163
case AArch64::ADDv8i16:
6164
setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6165
setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6166
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6167
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6168
break;
6169
case AArch64::ADDv2i32:
6170
setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6171
setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6172
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6173
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6174
break;
6175
case AArch64::ADDv4i32:
6176
setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6177
setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6178
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6179
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6180
break;
6181
case AArch64::SUBv8i8:
6182
setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6183
setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6184
break;
6185
case AArch64::SUBv16i8:
6186
setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6187
setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6188
break;
6189
case AArch64::SUBv4i16:
6190
setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6191
setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6192
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6193
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6194
break;
6195
case AArch64::SUBv8i16:
6196
setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6197
setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6198
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6199
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6200
break;
6201
case AArch64::SUBv2i32:
6202
setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6203
setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6204
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6205
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6206
break;
6207
case AArch64::SUBv4i32:
6208
setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6209
setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6210
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6211
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6212
break;
6213
}
6214
return Found;
6215
}
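
// Illustrative example (not part of the upstream source): for
//   %3:gpr32 = ADDWrr %2, %1
// where %2 is defined by MADDWrrr %a, %b, $wzr (a plain MUL) with a single
// use, MULADDW_OP1 is recorded so the machine combiner can later evaluate
// rewriting the pair as a single MADDWrrr %a, %b, %1.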
6216
/// Floating-Point Support
6217
6218
/// Find instructions that can be turned into madd.
6219
static bool getFMAPatterns(MachineInstr &Root,
6220
SmallVectorImpl<unsigned> &Patterns) {
6221
6222
if (!isCombineInstrCandidateFP(Root))
6223
return false;
6224
6225
MachineBasicBlock &MBB = *Root.getParent();
6226
bool Found = false;
6227
6228
auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6229
if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6230
Patterns.push_back(Pattern);
6231
return true;
6232
}
6233
return false;
6234
};
6235
6236
typedef AArch64MachineCombinerPattern MCP;
6237
6238
switch (Root.getOpcode()) {
6239
default:
6240
assert(false && "Unsupported FP instruction in combiner\n");
6241
break;
6242
case AArch64::FADDHrr:
6243
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6244
"FADDHrr does not have register operands");
6245
6246
Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6247
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6248
break;
6249
case AArch64::FADDSrr:
6250
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6251
"FADDSrr does not have register operands");
6252
6253
Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6254
Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6255
6256
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6257
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6258
break;
6259
case AArch64::FADDDrr:
6260
Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6261
Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6262
6263
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6264
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6265
break;
6266
case AArch64::FADDv4f16:
6267
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6268
Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6269
6270
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6271
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6272
break;
6273
case AArch64::FADDv8f16:
6274
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6275
Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6276
6277
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6278
Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6279
break;
6280
case AArch64::FADDv2f32:
6281
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6282
Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6283
6284
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6285
Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6286
break;
6287
case AArch64::FADDv2f64:
6288
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6289
Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6290
6291
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6292
Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6293
break;
6294
case AArch64::FADDv4f32:
6295
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6296
Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6297
6298
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6299
Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6300
break;
6301
case AArch64::FSUBHrr:
6302
Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6303
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6304
Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6305
break;
6306
case AArch64::FSUBSrr:
6307
Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6308
6309
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6310
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6311
6312
Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6313
break;
6314
case AArch64::FSUBDrr:
6315
Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6316
6317
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6318
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6319
6320
Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6321
break;
6322
case AArch64::FSUBv4f16:
6323
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6324
Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6325
6326
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6327
Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6328
break;
6329
case AArch64::FSUBv8f16:
6330
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6331
Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6332
6333
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6334
Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6335
break;
6336
case AArch64::FSUBv2f32:
6337
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6338
Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6339
6340
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6341
Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6342
break;
6343
case AArch64::FSUBv2f64:
6344
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6345
Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6346
6347
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6348
Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6349
break;
6350
case AArch64::FSUBv4f32:
6351
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6352
Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6353
6354
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6355
Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6356
break;
6357
}
6358
return Found;
6359
}
6360
6361
static bool getFMULPatterns(MachineInstr &Root,
6362
SmallVectorImpl<unsigned> &Patterns) {
6363
MachineBasicBlock &MBB = *Root.getParent();
6364
bool Found = false;
6365
6366
auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6367
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6368
MachineOperand &MO = Root.getOperand(Operand);
6369
MachineInstr *MI = nullptr;
6370
if (MO.isReg() && MO.getReg().isVirtual())
6371
MI = MRI.getUniqueVRegDef(MO.getReg());
6372
// Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6373
if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6374
MI->getOperand(1).getReg().isVirtual())
6375
MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6376
if (MI && MI->getOpcode() == Opcode) {
6377
Patterns.push_back(Pattern);
6378
return true;
6379
}
6380
return false;
6381
};
6382
6383
typedef AArch64MachineCombinerPattern MCP;
6384
6385
switch (Root.getOpcode()) {
6386
default:
6387
return false;
6388
case AArch64::FMULv2f32:
6389
Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6390
Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6391
break;
6392
case AArch64::FMULv2f64:
6393
Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6394
Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6395
break;
6396
case AArch64::FMULv4f16:
6397
Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6398
Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6399
break;
6400
case AArch64::FMULv4f32:
6401
Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6402
Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6403
break;
6404
case AArch64::FMULv8f16:
6405
Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6406
Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6407
break;
6408
}
6409
6410
return Found;
6411
}
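
// Illustrative example (not part of the upstream source): an FMULv2f32 whose
// operand is fed, possibly through a no-op COPY, by a DUPv2i32lane records
// FMULv2i32_indexed_OP1/OP2, so the multiply can later be considered for the
// by-element FMULv2i32_indexed form.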
6412
6413
static bool getFNEGPatterns(MachineInstr &Root,
6414
SmallVectorImpl<unsigned> &Patterns) {
6415
unsigned Opc = Root.getOpcode();
6416
MachineBasicBlock &MBB = *Root.getParent();
6417
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6418
6419
auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6420
MachineOperand &MO = Root.getOperand(1);
6421
MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6422
if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6423
MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6424
Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6425
Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6426
MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6427
MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6428
Patterns.push_back(Pattern);
6429
return true;
6430
}
6431
return false;
6432
};
6433
6434
switch (Opc) {
6435
default:
6436
break;
6437
case AArch64::FNEGDr:
6438
return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6439
case AArch64::FNEGSr:
6440
return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6441
}
6442
6443
return false;
6444
}
6445
6446
/// Return true when a code sequence can improve throughput. It
6447
/// should be called only for instructions in loops.
6448
/// \param Pattern - combiner pattern
6449
bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6450
switch (Pattern) {
6451
default:
6452
break;
6453
case AArch64MachineCombinerPattern::FMULADDH_OP1:
6454
case AArch64MachineCombinerPattern::FMULADDH_OP2:
6455
case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6456
case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6457
case AArch64MachineCombinerPattern::FMULADDS_OP1:
6458
case AArch64MachineCombinerPattern::FMULADDS_OP2:
6459
case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6460
case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6461
case AArch64MachineCombinerPattern::FMULADDD_OP1:
6462
case AArch64MachineCombinerPattern::FMULADDD_OP2:
6463
case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6464
case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6465
case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6466
case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6467
case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6468
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6469
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6470
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6471
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6472
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6473
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6474
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6475
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6476
case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6477
case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6478
case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6479
case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6480
case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6481
case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6482
case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6483
case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6484
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6485
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6486
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
}

/// Find other MI combine patterns.
static bool getMiscPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  // A - (B + C) ==> (A - B) - C or (A - C) - B
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();

  switch (Opc) {
  case AArch64::SUBWrr:
  case AArch64::SUBSWrr:
  case AArch64::SUBXrr:
  case AArch64::SUBSXrr:
    // Found candidate root.
    break;
  default:
    return false;
  }

  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
          -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}
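
// Illustrative note (not part of the upstream source): the shape matched by
// getMiscPatterns above, sketched as pseudo machine IR on hypothetical
// virtual registers:
//   %t = ADDWrr %b, %c
//   %r = SUBWrr %a, %t        ; Root matches A - (B + C)
// The SUBADD_OP1/SUBADD_OP2 patterns recorded here are later materialized by
// genSubAdd2SubSub() below as (A - B) - C and (A - C) - B; the machine
// combiner only keeps such a rewrite when it reduces dependence depth (see
// getCombinerObjective below).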

CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
  switch (Pattern) {
  case AArch64MachineCombinerPattern::SUBADD_OP1:
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    return CombinerObjective::MustReduceDepth;
  default:
    return TargetInstrInfo::getCombinerObjective(Pattern);
  }
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
/// pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
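// Editorial note (inferred from genFusedMultiply below, not upstream text):
//   Default     - scalar-style F|MADD form; the addend register is appended
//                 last.
//   Indexed     - lane-indexed FMLA/FMLS form; the accumulator comes first
//                 and the lane immediate of the original F|MUL is appended.
//   Accumulator - vector FMLA/FMLS/MLA/MLS form; the accumulator comes first
//                 and no lane immediate is used.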
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind the kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

static MachineInstr *
genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
               const TargetInstrInfo *TII, MachineInstr &Root,
               SmallVectorImpl<MachineInstr *> &InsInstrs) {
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(SrcReg2, getKillRegState(Src2IsKill));
  InsInstrs.push_back(MIB);

  return MAD;
}

/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static MachineInstr *
genIndexedMultiply(MachineInstr &Root,
                   SmallVectorImpl<MachineInstr *> &InsInstrs,
                   unsigned IdxDupOp, unsigned MulOpc,
                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
         "Invalid index of FMUL operand");

  MachineFunction &MF = *Root.getMF();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *Dup =
      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());

  if (Dup->getOpcode() == TargetOpcode::COPY)
    Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());

  Register DupSrcReg = Dup->getOperand(1).getReg();
  MRI.clearKillFlags(DupSrcReg);
  MRI.constrainRegClass(DupSrcReg, RC);

  unsigned DupSrcLane = Dup->getOperand(2).getImm();

  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
  MachineOperand &MulOp = Root.getOperand(IdxMulOp);

  Register ResultReg = Root.getOperand(0).getReg();

  MachineInstrBuilder MIB;
  MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
            .add(MulOp)
            .addReg(DupSrcReg)
            .addImm(DupSrcLane);

  InsInstrs.push_back(MIB);
  return &Root;
}

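// Illustrative note (not part of the upstream source): the rewrite performed
// by genIndexedMultiply above, sketched on hypothetical virtual registers for
// a v2f32 multiply (the DUP opcode name here is for illustration only):
//   %dup = DUPv2i32lane %vec, 1
//   %res = FMULv2f32 %x, %dup
//   ==>
//   %res = FMULv2i32_indexed %x, %vec, 1
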
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
/// accumulate instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
                              const TargetInstrInfo *TII, MachineInstr &Root,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
                              const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(VR))
    MRI.constrainRegClass(VR, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(VR);
  // Insert the MADD
  InsInstrs.push_back(MIB);
  return MUL;
}

/// Do the following transformation
/// A - (B + C) ==> (A - B) - C
/// A - (B + C) ==> (A - C) - B
static void
genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs,
                 SmallVectorImpl<MachineInstr *> &DelInstrs,
                 unsigned IdxOpd1,
                 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

  Register ResultReg = Root.getOperand(0).getReg();
  Register RegA = Root.getOperand(1).getReg();
  bool RegAIsKill = Root.getOperand(1).isKill();
  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
  Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));

  unsigned Opcode = Root.getOpcode();
  if (Opcode == AArch64::SUBSWrr)
    Opcode = AArch64::SUBWrr;
  else if (Opcode == AArch64::SUBSXrr)
    Opcode = AArch64::SUBXrr;
  else
    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
           "Unexpected instruction opcode.");

  uint32_t Flags = Root.mergeFlagsWith(*AddMI);
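  // Editorial note (explanation, not upstream text): the no-signed-wrap and
  // no-unsigned-wrap flags are cleared below because reassociating
  // A - (B + C) into (A - B) - C can overflow in the intermediate
  // subtraction even when the original sequence did not.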
  Flags &= ~MachineInstr::NoSWrap;
  Flags &= ~MachineInstr::NoUWrap;

  MachineInstrBuilder MIB1 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
          .addReg(RegA, getKillRegState(RegAIsKill))
          .addReg(RegB, getKillRegState(RegBIsKill))
          .setMIFlags(Flags);
  MachineInstrBuilder MIB2 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
          .addReg(NewVR, getKillRegState(true))
          .addReg(RegC, getKillRegState(RegCIsKill))
          .setMIFlags(Flags);

  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
  InsInstrs.push_back(MIB1);
  InsInstrs.push_back(MIB2);
  DelInstrs.push_back(AddMI);
  DelInstrs.push_back(&Root);
}

/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, unsigned Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *MUL = nullptr;
  const TargetRegisterClass *RC;
  unsigned Opc;
  switch (Pattern) {
  default:
    // Reassociate instructions.
    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                DelInstrs, InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP1:
    // A - (B + C)
    // ==> (A - B) - C
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
                     InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    // A - (B + C)
    // ==> (A - C) - B
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
                     InstrIdxForVirtReg);
    return;
case AArch64MachineCombinerPattern::MULADDW_OP1:
7012
case AArch64MachineCombinerPattern::MULADDX_OP1:
7013
// MUL I=A,B,0
7014
// ADD R,I,C
7015
// ==> MADD R,A,B,C
7016
// --- Create(MADD);
7017
if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7018
Opc = AArch64::MADDWrrr;
7019
RC = &AArch64::GPR32RegClass;
7020
} else {
7021
Opc = AArch64::MADDXrrr;
7022
RC = &AArch64::GPR64RegClass;
7023
}
7024
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7025
break;
7026
case AArch64MachineCombinerPattern::MULADDW_OP2:
7027
case AArch64MachineCombinerPattern::MULADDX_OP2:
7028
// MUL I=A,B,0
7029
// ADD R,C,I
7030
// ==> MADD R,A,B,C
7031
// --- Create(MADD);
7032
if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7033
Opc = AArch64::MADDWrrr;
7034
RC = &AArch64::GPR32RegClass;
7035
} else {
7036
Opc = AArch64::MADDXrrr;
7037
RC = &AArch64::GPR64RegClass;
7038
}
7039
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7040
break;
7041
case AArch64MachineCombinerPattern::MULADDWI_OP1:
7042
case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7043
// MUL I=A,B,0
7044
// ADD R,I,Imm
7045
// ==> MOV V, Imm
7046
// ==> MADD R,A,B,V
7047
// --- Create(MADD);
7048
const TargetRegisterClass *OrrRC;
7049
unsigned BitSize, OrrOpc, ZeroReg;
7050
if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7051
OrrOpc = AArch64::ORRWri;
7052
OrrRC = &AArch64::GPR32spRegClass;
7053
BitSize = 32;
7054
ZeroReg = AArch64::WZR;
7055
Opc = AArch64::MADDWrrr;
7056
RC = &AArch64::GPR32RegClass;
7057
} else {
7058
OrrOpc = AArch64::ORRXri;
7059
OrrRC = &AArch64::GPR64spRegClass;
7060
BitSize = 64;
7061
ZeroReg = AArch64::XZR;
7062
Opc = AArch64::MADDXrrr;
7063
RC = &AArch64::GPR64RegClass;
7064
}
7065
Register NewVR = MRI.createVirtualRegister(OrrRC);
7066
uint64_t Imm = Root.getOperand(2).getImm();
7067
7068
if (Root.getOperand(3).isImm()) {
7069
unsigned Val = Root.getOperand(3).getImm();
7070
Imm = Imm << Val;
7071
}
7072
uint64_t UImm = SignExtend64(Imm, BitSize);
7073
// The immediate can be composed via a single instruction.
7074
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7075
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7076
if (Insn.size() != 1)
7077
return;
7078
auto MovI = Insn.begin();
7079
MachineInstrBuilder MIB1;
7080
// MOV is an alias for one of three instructions: movz, movn, and orr.
7081
if (MovI->Opcode == OrrOpc)
7082
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7083
.addReg(ZeroReg)
7084
.addImm(MovI->Op2);
7085
else {
7086
if (BitSize == 32)
7087
assert((MovI->Opcode == AArch64::MOVNWi ||
7088
MovI->Opcode == AArch64::MOVZWi) &&
7089
"Expected opcode");
7090
else
7091
assert((MovI->Opcode == AArch64::MOVNXi ||
7092
MovI->Opcode == AArch64::MOVZXi) &&
7093
"Expected opcode");
7094
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7095
.addImm(MovI->Op1)
7096
.addImm(MovI->Op2);
7097
}
7098
InsInstrs.push_back(MIB1);
7099
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7100
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7101
break;
7102
}
7103
case AArch64MachineCombinerPattern::MULSUBW_OP1:
7104
case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7105
// MUL I=A,B,0
7106
// SUB R,I, C
7107
// ==> SUB V, 0, C
7108
// ==> MADD R,A,B,V // = -C + A*B
7109
// --- Create(MADD);
7110
const TargetRegisterClass *SubRC;
7111
unsigned SubOpc, ZeroReg;
7112
if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7113
SubOpc = AArch64::SUBWrr;
7114
SubRC = &AArch64::GPR32spRegClass;
7115
ZeroReg = AArch64::WZR;
7116
Opc = AArch64::MADDWrrr;
7117
RC = &AArch64::GPR32RegClass;
7118
} else {
7119
SubOpc = AArch64::SUBXrr;
7120
SubRC = &AArch64::GPR64spRegClass;
7121
ZeroReg = AArch64::XZR;
7122
Opc = AArch64::MADDXrrr;
7123
RC = &AArch64::GPR64RegClass;
7124
}
7125
Register NewVR = MRI.createVirtualRegister(SubRC);
7126
// SUB NewVR, 0, C
7127
MachineInstrBuilder MIB1 =
7128
BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7129
.addReg(ZeroReg)
7130
.add(Root.getOperand(2));
7131
InsInstrs.push_back(MIB1);
7132
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7133
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7134
break;
7135
}
7136
case AArch64MachineCombinerPattern::MULSUBW_OP2:
7137
case AArch64MachineCombinerPattern::MULSUBX_OP2:
7138
// MUL I=A,B,0
7139
// SUB R,C,I
7140
// ==> MSUB R,A,B,C (computes C - A*B)
7141
// --- Create(MSUB);
7142
if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7143
Opc = AArch64::MSUBWrrr;
7144
RC = &AArch64::GPR32RegClass;
7145
} else {
7146
Opc = AArch64::MSUBXrrr;
7147
RC = &AArch64::GPR64RegClass;
7148
}
7149
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7150
break;
7151
case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7152
case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7153
// MUL I=A,B,0
7154
// SUB R,I, Imm
7155
// ==> MOV V, -Imm
7156
// ==> MADD R,A,B,V // = -Imm + A*B
7157
// --- Create(MADD);
7158
const TargetRegisterClass *OrrRC;
7159
unsigned BitSize, OrrOpc, ZeroReg;
7160
if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7161
OrrOpc = AArch64::ORRWri;
7162
OrrRC = &AArch64::GPR32spRegClass;
7163
BitSize = 32;
7164
ZeroReg = AArch64::WZR;
7165
Opc = AArch64::MADDWrrr;
7166
RC = &AArch64::GPR32RegClass;
7167
} else {
7168
OrrOpc = AArch64::ORRXri;
7169
OrrRC = &AArch64::GPR64spRegClass;
7170
BitSize = 64;
7171
ZeroReg = AArch64::XZR;
7172
Opc = AArch64::MADDXrrr;
7173
RC = &AArch64::GPR64RegClass;
7174
}
7175
Register NewVR = MRI.createVirtualRegister(OrrRC);
7176
uint64_t Imm = Root.getOperand(2).getImm();
7177
if (Root.getOperand(3).isImm()) {
7178
unsigned Val = Root.getOperand(3).getImm();
7179
Imm = Imm << Val;
7180
}
7181
uint64_t UImm = SignExtend64(-Imm, BitSize);
7182
// The immediate can be composed via a single instruction.
7183
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7184
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7185
if (Insn.size() != 1)
7186
return;
7187
auto MovI = Insn.begin();
7188
MachineInstrBuilder MIB1;
7189
// MOV is an alias for one of three instructions: movz, movn, and orr.
7190
if (MovI->Opcode == OrrOpc)
7191
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7192
.addReg(ZeroReg)
7193
.addImm(MovI->Op2);
7194
else {
7195
if (BitSize == 32)
7196
assert((MovI->Opcode == AArch64::MOVNWi ||
7197
MovI->Opcode == AArch64::MOVZWi) &&
7198
"Expected opcode");
7199
else
7200
assert((MovI->Opcode == AArch64::MOVNXi ||
7201
MovI->Opcode == AArch64::MOVZXi) &&
7202
"Expected opcode");
7203
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7204
.addImm(MovI->Op1)
7205
.addImm(MovI->Op2);
7206
}
7207
InsInstrs.push_back(MIB1);
7208
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7209
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7210
break;
7211
}
7212
7213
case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7214
Opc = AArch64::MLAv8i8;
7215
RC = &AArch64::FPR64RegClass;
7216
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7217
break;
7218
case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7219
Opc = AArch64::MLAv8i8;
7220
RC = &AArch64::FPR64RegClass;
7221
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7222
break;
7223
case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7224
Opc = AArch64::MLAv16i8;
7225
RC = &AArch64::FPR128RegClass;
7226
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7227
break;
7228
case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7229
Opc = AArch64::MLAv16i8;
7230
RC = &AArch64::FPR128RegClass;
7231
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7232
break;
7233
case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7234
Opc = AArch64::MLAv4i16;
7235
RC = &AArch64::FPR64RegClass;
7236
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7237
break;
7238
case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7239
Opc = AArch64::MLAv4i16;
7240
RC = &AArch64::FPR64RegClass;
7241
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7242
break;
7243
case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7244
Opc = AArch64::MLAv8i16;
7245
RC = &AArch64::FPR128RegClass;
7246
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7247
break;
7248
case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7249
Opc = AArch64::MLAv8i16;
7250
RC = &AArch64::FPR128RegClass;
7251
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7252
break;
7253
case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7254
Opc = AArch64::MLAv2i32;
7255
RC = &AArch64::FPR64RegClass;
7256
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7257
break;
7258
case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7259
Opc = AArch64::MLAv2i32;
7260
RC = &AArch64::FPR64RegClass;
7261
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7262
break;
7263
case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7264
Opc = AArch64::MLAv4i32;
7265
RC = &AArch64::FPR128RegClass;
7266
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7267
break;
7268
case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7269
Opc = AArch64::MLAv4i32;
7270
RC = &AArch64::FPR128RegClass;
7271
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7272
break;
7273
7274
case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7275
Opc = AArch64::MLAv8i8;
7276
RC = &AArch64::FPR64RegClass;
7277
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7278
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7279
RC);
7280
break;
7281
case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7282
Opc = AArch64::MLSv8i8;
7283
RC = &AArch64::FPR64RegClass;
7284
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7285
break;
7286
case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7287
Opc = AArch64::MLAv16i8;
7288
RC = &AArch64::FPR128RegClass;
7289
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7290
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7291
RC);
7292
break;
7293
case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7294
Opc = AArch64::MLSv16i8;
7295
RC = &AArch64::FPR128RegClass;
7296
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7297
break;
7298
case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7299
Opc = AArch64::MLAv4i16;
7300
RC = &AArch64::FPR64RegClass;
7301
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7302
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7303
RC);
7304
break;
7305
case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7306
Opc = AArch64::MLSv4i16;
7307
RC = &AArch64::FPR64RegClass;
7308
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7309
break;
7310
case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7311
Opc = AArch64::MLAv8i16;
7312
RC = &AArch64::FPR128RegClass;
7313
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7314
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7315
RC);
7316
break;
7317
case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7318
Opc = AArch64::MLSv8i16;
7319
RC = &AArch64::FPR128RegClass;
7320
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7321
break;
7322
case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7323
Opc = AArch64::MLAv2i32;
7324
RC = &AArch64::FPR64RegClass;
7325
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7326
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7327
RC);
7328
break;
7329
case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7330
Opc = AArch64::MLSv2i32;
7331
RC = &AArch64::FPR64RegClass;
7332
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7333
break;
7334
case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7335
Opc = AArch64::MLAv4i32;
7336
RC = &AArch64::FPR128RegClass;
7337
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7338
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7339
RC);
7340
break;
7341
case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7342
Opc = AArch64::MLSv4i32;
7343
RC = &AArch64::FPR128RegClass;
7344
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7345
break;
7346
7347
case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7348
Opc = AArch64::MLAv4i16_indexed;
7349
RC = &AArch64::FPR64RegClass;
7350
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7351
break;
7352
case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7353
Opc = AArch64::MLAv4i16_indexed;
7354
RC = &AArch64::FPR64RegClass;
7355
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7356
break;
7357
case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7358
Opc = AArch64::MLAv8i16_indexed;
7359
RC = &AArch64::FPR128RegClass;
7360
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7361
break;
7362
case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7363
Opc = AArch64::MLAv8i16_indexed;
7364
RC = &AArch64::FPR128RegClass;
7365
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7366
break;
7367
case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7368
Opc = AArch64::MLAv2i32_indexed;
7369
RC = &AArch64::FPR64RegClass;
7370
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7371
break;
7372
case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7373
Opc = AArch64::MLAv2i32_indexed;
7374
RC = &AArch64::FPR64RegClass;
7375
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7376
break;
7377
case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7378
Opc = AArch64::MLAv4i32_indexed;
7379
RC = &AArch64::FPR128RegClass;
7380
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7381
break;
7382
case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7383
Opc = AArch64::MLAv4i32_indexed;
7384
RC = &AArch64::FPR128RegClass;
7385
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7386
break;
7387
7388
case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7389
Opc = AArch64::MLAv4i16_indexed;
7390
RC = &AArch64::FPR64RegClass;
7391
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7392
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7393
RC);
7394
break;
7395
case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7396
Opc = AArch64::MLSv4i16_indexed;
7397
RC = &AArch64::FPR64RegClass;
7398
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7399
break;
7400
case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7401
Opc = AArch64::MLAv8i16_indexed;
7402
RC = &AArch64::FPR128RegClass;
7403
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7404
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7405
RC);
7406
break;
7407
case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7408
Opc = AArch64::MLSv8i16_indexed;
7409
RC = &AArch64::FPR128RegClass;
7410
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7411
break;
7412
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7413
Opc = AArch64::MLAv2i32_indexed;
7414
RC = &AArch64::FPR64RegClass;
7415
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7416
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7417
RC);
7418
break;
7419
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7420
Opc = AArch64::MLSv2i32_indexed;
7421
RC = &AArch64::FPR64RegClass;
7422
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7423
break;
7424
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7425
Opc = AArch64::MLAv4i32_indexed;
7426
RC = &AArch64::FPR128RegClass;
7427
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7428
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7429
RC);
7430
break;
7431
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7432
Opc = AArch64::MLSv4i32_indexed;
7433
RC = &AArch64::FPR128RegClass;
7434
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7435
break;
7436
7437
// Floating Point Support
7438
case AArch64MachineCombinerPattern::FMULADDH_OP1:
7439
Opc = AArch64::FMADDHrrr;
7440
RC = &AArch64::FPR16RegClass;
7441
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7442
break;
7443
case AArch64MachineCombinerPattern::FMULADDS_OP1:
7444
Opc = AArch64::FMADDSrrr;
7445
RC = &AArch64::FPR32RegClass;
7446
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7447
break;
7448
case AArch64MachineCombinerPattern::FMULADDD_OP1:
7449
Opc = AArch64::FMADDDrrr;
7450
RC = &AArch64::FPR64RegClass;
7451
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7452
break;
7453
7454
case AArch64MachineCombinerPattern::FMULADDH_OP2:
7455
Opc = AArch64::FMADDHrrr;
7456
RC = &AArch64::FPR16RegClass;
7457
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7458
break;
7459
case AArch64MachineCombinerPattern::FMULADDS_OP2:
7460
Opc = AArch64::FMADDSrrr;
7461
RC = &AArch64::FPR32RegClass;
7462
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7463
break;
7464
case AArch64MachineCombinerPattern::FMULADDD_OP2:
7465
Opc = AArch64::FMADDDrrr;
7466
RC = &AArch64::FPR64RegClass;
7467
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7468
break;
7469
7470
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7471
Opc = AArch64::FMLAv1i32_indexed;
7472
RC = &AArch64::FPR32RegClass;
7473
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7474
FMAInstKind::Indexed);
7475
break;
7476
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7477
Opc = AArch64::FMLAv1i32_indexed;
7478
RC = &AArch64::FPR32RegClass;
7479
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7480
FMAInstKind::Indexed);
7481
break;
7482
7483
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7484
Opc = AArch64::FMLAv1i64_indexed;
7485
RC = &AArch64::FPR64RegClass;
7486
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7487
FMAInstKind::Indexed);
7488
break;
7489
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7490
Opc = AArch64::FMLAv1i64_indexed;
7491
RC = &AArch64::FPR64RegClass;
7492
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7493
FMAInstKind::Indexed);
7494
break;
7495
7496
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7497
RC = &AArch64::FPR64RegClass;
7498
Opc = AArch64::FMLAv4i16_indexed;
7499
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7500
FMAInstKind::Indexed);
7501
break;
7502
case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7503
RC = &AArch64::FPR64RegClass;
7504
Opc = AArch64::FMLAv4f16;
7505
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7506
FMAInstKind::Accumulator);
7507
break;
7508
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7509
RC = &AArch64::FPR64RegClass;
7510
Opc = AArch64::FMLAv4i16_indexed;
7511
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7512
FMAInstKind::Indexed);
7513
break;
7514
case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7515
RC = &AArch64::FPR64RegClass;
7516
Opc = AArch64::FMLAv4f16;
7517
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7518
FMAInstKind::Accumulator);
7519
break;
7520
7521
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7522
case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7523
RC = &AArch64::FPR64RegClass;
7524
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7525
Opc = AArch64::FMLAv2i32_indexed;
7526
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7527
FMAInstKind::Indexed);
7528
} else {
7529
Opc = AArch64::FMLAv2f32;
7530
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7531
FMAInstKind::Accumulator);
7532
}
7533
break;
7534
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7535
case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7536
RC = &AArch64::FPR64RegClass;
7537
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7538
Opc = AArch64::FMLAv2i32_indexed;
7539
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7540
FMAInstKind::Indexed);
7541
} else {
7542
Opc = AArch64::FMLAv2f32;
7543
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7544
FMAInstKind::Accumulator);
7545
}
7546
break;
7547
7548
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7549
RC = &AArch64::FPR128RegClass;
7550
Opc = AArch64::FMLAv8i16_indexed;
7551
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7552
FMAInstKind::Indexed);
7553
break;
7554
case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7555
RC = &AArch64::FPR128RegClass;
7556
Opc = AArch64::FMLAv8f16;
7557
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7558
FMAInstKind::Accumulator);
7559
break;
7560
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7561
RC = &AArch64::FPR128RegClass;
7562
Opc = AArch64::FMLAv8i16_indexed;
7563
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7564
FMAInstKind::Indexed);
7565
break;
7566
case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7567
RC = &AArch64::FPR128RegClass;
7568
Opc = AArch64::FMLAv8f16;
7569
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7570
FMAInstKind::Accumulator);
7571
break;
7572
7573
case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7574
case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7575
RC = &AArch64::FPR128RegClass;
7576
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7577
Opc = AArch64::FMLAv2i64_indexed;
7578
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7579
FMAInstKind::Indexed);
7580
} else {
7581
Opc = AArch64::FMLAv2f64;
7582
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7583
FMAInstKind::Accumulator);
7584
}
7585
break;
7586
case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7587
case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7588
RC = &AArch64::FPR128RegClass;
7589
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7590
Opc = AArch64::FMLAv2i64_indexed;
7591
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7592
FMAInstKind::Indexed);
7593
} else {
7594
Opc = AArch64::FMLAv2f64;
7595
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7596
FMAInstKind::Accumulator);
7597
}
7598
break;
7599
7600
case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7601
case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7602
RC = &AArch64::FPR128RegClass;
7603
if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7604
Opc = AArch64::FMLAv4i32_indexed;
7605
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7606
FMAInstKind::Indexed);
7607
} else {
7608
Opc = AArch64::FMLAv4f32;
7609
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7610
FMAInstKind::Accumulator);
7611
}
7612
break;
7613
7614
case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7615
case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7616
RC = &AArch64::FPR128RegClass;
7617
if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7618
Opc = AArch64::FMLAv4i32_indexed;
7619
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7620
FMAInstKind::Indexed);
7621
} else {
7622
Opc = AArch64::FMLAv4f32;
7623
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7624
FMAInstKind::Accumulator);
7625
}
7626
break;
7627
7628
case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7629
Opc = AArch64::FNMSUBHrrr;
7630
RC = &AArch64::FPR16RegClass;
7631
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7632
break;
7633
case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7634
Opc = AArch64::FNMSUBSrrr;
7635
RC = &AArch64::FPR32RegClass;
7636
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7637
break;
7638
case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7639
Opc = AArch64::FNMSUBDrrr;
7640
RC = &AArch64::FPR64RegClass;
7641
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7642
break;
7643
7644
case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7645
Opc = AArch64::FNMADDHrrr;
7646
RC = &AArch64::FPR16RegClass;
7647
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7648
break;
7649
case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7650
Opc = AArch64::FNMADDSrrr;
7651
RC = &AArch64::FPR32RegClass;
7652
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7653
break;
7654
case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7655
Opc = AArch64::FNMADDDrrr;
7656
RC = &AArch64::FPR64RegClass;
7657
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7658
break;
7659
7660
case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7661
Opc = AArch64::FMSUBHrrr;
7662
RC = &AArch64::FPR16RegClass;
7663
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7664
break;
7665
case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7666
Opc = AArch64::FMSUBSrrr;
7667
RC = &AArch64::FPR32RegClass;
7668
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7669
break;
7670
case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7671
Opc = AArch64::FMSUBDrrr;
7672
RC = &AArch64::FPR64RegClass;
7673
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7674
break;
7675
7676
case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7677
Opc = AArch64::FMLSv1i32_indexed;
7678
RC = &AArch64::FPR32RegClass;
7679
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7680
FMAInstKind::Indexed);
7681
break;
7682
7683
case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7684
Opc = AArch64::FMLSv1i64_indexed;
7685
RC = &AArch64::FPR64RegClass;
7686
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7687
FMAInstKind::Indexed);
7688
break;
7689
7690
case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7691
case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7692
RC = &AArch64::FPR64RegClass;
7693
Register NewVR = MRI.createVirtualRegister(RC);
7694
MachineInstrBuilder MIB1 =
7695
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7696
.add(Root.getOperand(2));
7697
InsInstrs.push_back(MIB1);
7698
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7699
if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7700
Opc = AArch64::FMLAv4f16;
7701
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7702
FMAInstKind::Accumulator, &NewVR);
7703
} else {
7704
Opc = AArch64::FMLAv4i16_indexed;
7705
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7706
FMAInstKind::Indexed, &NewVR);
7707
}
7708
break;
7709
}
7710
case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7711
RC = &AArch64::FPR64RegClass;
7712
Opc = AArch64::FMLSv4f16;
7713
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7714
FMAInstKind::Accumulator);
7715
break;
7716
case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7717
RC = &AArch64::FPR64RegClass;
7718
Opc = AArch64::FMLSv4i16_indexed;
7719
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7720
FMAInstKind::Indexed);
7721
break;
7722
7723
case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7724
case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7725
RC = &AArch64::FPR64RegClass;
7726
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7727
Opc = AArch64::FMLSv2i32_indexed;
7728
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7729
FMAInstKind::Indexed);
7730
} else {
7731
Opc = AArch64::FMLSv2f32;
7732
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7733
FMAInstKind::Accumulator);
7734
}
7735
break;
7736
7737
case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7738
case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7739
RC = &AArch64::FPR128RegClass;
7740
Register NewVR = MRI.createVirtualRegister(RC);
7741
MachineInstrBuilder MIB1 =
7742
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7743
.add(Root.getOperand(2));
7744
InsInstrs.push_back(MIB1);
7745
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7746
if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7747
Opc = AArch64::FMLAv8f16;
7748
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7749
FMAInstKind::Accumulator, &NewVR);
7750
} else {
7751
Opc = AArch64::FMLAv8i16_indexed;
7752
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7753
FMAInstKind::Indexed, &NewVR);
7754
}
7755
break;
7756
}
7757
case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7758
RC = &AArch64::FPR128RegClass;
7759
Opc = AArch64::FMLSv8f16;
7760
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7761
FMAInstKind::Accumulator);
7762
break;
7763
case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7764
RC = &AArch64::FPR128RegClass;
7765
Opc = AArch64::FMLSv8i16_indexed;
7766
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7767
FMAInstKind::Indexed);
7768
break;
7769
7770
case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7771
case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7772
RC = &AArch64::FPR128RegClass;
7773
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7774
Opc = AArch64::FMLSv2i64_indexed;
7775
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7776
FMAInstKind::Indexed);
7777
} else {
7778
Opc = AArch64::FMLSv2f64;
7779
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7780
FMAInstKind::Accumulator);
7781
}
7782
break;
7783
7784
case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7785
case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7786
RC = &AArch64::FPR128RegClass;
7787
if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7788
Opc = AArch64::FMLSv4i32_indexed;
7789
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7790
FMAInstKind::Indexed);
7791
} else {
7792
Opc = AArch64::FMLSv4f32;
7793
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7794
FMAInstKind::Accumulator);
7795
}
7796
break;
7797
case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7798
case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7799
RC = &AArch64::FPR64RegClass;
7800
Register NewVR = MRI.createVirtualRegister(RC);
7801
MachineInstrBuilder MIB1 =
7802
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7803
.add(Root.getOperand(2));
7804
InsInstrs.push_back(MIB1);
7805
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7806
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7807
Opc = AArch64::FMLAv2i32_indexed;
7808
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7809
FMAInstKind::Indexed, &NewVR);
7810
} else {
7811
Opc = AArch64::FMLAv2f32;
7812
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7813
FMAInstKind::Accumulator, &NewVR);
7814
}
7815
break;
7816
}
7817
case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7818
case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7819
RC = &AArch64::FPR128RegClass;
7820
Register NewVR = MRI.createVirtualRegister(RC);
7821
MachineInstrBuilder MIB1 =
7822
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7823
.add(Root.getOperand(2));
7824
InsInstrs.push_back(MIB1);
7825
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7826
if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7827
Opc = AArch64::FMLAv4i32_indexed;
7828
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7829
FMAInstKind::Indexed, &NewVR);
7830
} else {
7831
Opc = AArch64::FMLAv4f32;
7832
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7833
FMAInstKind::Accumulator, &NewVR);
7834
}
7835
break;
7836
}
7837
case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7838
case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7839
RC = &AArch64::FPR128RegClass;
7840
Register NewVR = MRI.createVirtualRegister(RC);
7841
MachineInstrBuilder MIB1 =
7842
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7843
.add(Root.getOperand(2));
7844
InsInstrs.push_back(MIB1);
7845
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7846
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7847
Opc = AArch64::FMLAv2i64_indexed;
7848
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7849
FMAInstKind::Indexed, &NewVR);
7850
} else {
7851
Opc = AArch64::FMLAv2f64;
7852
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7853
FMAInstKind::Accumulator, &NewVR);
7854
}
7855
break;
7856
}
7857
case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7858
case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7859
unsigned IdxDupOp =
7860
(Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7861
: 2;
7862
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7863
&AArch64::FPR128RegClass, MRI);
7864
break;
7865
}
7866
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7867
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7868
unsigned IdxDupOp =
7869
(Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7870
: 2;
7871
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7872
&AArch64::FPR128RegClass, MRI);
7873
break;
7874
}
7875
case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7876
case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7877
unsigned IdxDupOp =
7878
(Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7879
: 2;
7880
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7881
&AArch64::FPR128_loRegClass, MRI);
7882
break;
7883
}
7884
case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7885
case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7886
unsigned IdxDupOp =
7887
(Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7888
: 2;
7889
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7890
&AArch64::FPR128RegClass, MRI);
7891
break;
7892
}
7893
case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7894
case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7895
unsigned IdxDupOp =
7896
(Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7897
: 2;
7898
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7899
&AArch64::FPR128_loRegClass, MRI);
7900
break;
7901
}
7902
case AArch64MachineCombinerPattern::FNMADD: {
7903
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7904
break;
7905
}
7906
7907
  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  if (MUL)
    DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);

  // Set the flags on the inserted instructions to be the merged flags of the
  // instructions that we have combined.
  uint32_t Flags = Root.getFlags();
  if (MUL)
    Flags = Root.mergeFlagsWith(*MUL);
  for (auto *MI : InsInstrs)
    MI->setFlags(Flags);
}

/// Replace csinc-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz w9, #0, 0x44
/// \endcode
/// to
/// \code
///   b.<inverted condition code>
/// \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz w9, #0, 0x44
/// \endcode
/// to
/// \code
///   b.<condition code>
/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
/// \code
///   and w8, w8, #0x400
///   cbnz w8, L1
/// \endcode
/// to
/// \code
///   tbnz w8, #10, L1
/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7960
bool IsNegativeBranch = false;
7961
bool IsTestAndBranch = false;
7962
unsigned TargetBBInMI = 0;
7963
switch (MI.getOpcode()) {
7964
default:
7965
llvm_unreachable("Unknown branch instruction?");
7966
case AArch64::Bcc:
7967
return false;
7968
case AArch64::CBZW:
7969
case AArch64::CBZX:
7970
TargetBBInMI = 1;
7971
break;
7972
case AArch64::CBNZW:
7973
case AArch64::CBNZX:
7974
TargetBBInMI = 1;
7975
IsNegativeBranch = true;
7976
break;
7977
case AArch64::TBZW:
7978
case AArch64::TBZX:
7979
TargetBBInMI = 2;
7980
IsTestAndBranch = true;
7981
break;
7982
case AArch64::TBNZW:
7983
case AArch64::TBNZX:
7984
TargetBBInMI = 2;
7985
IsNegativeBranch = true;
7986
IsTestAndBranch = true;
7987
break;
7988
}
7989
// So we increment a zero register and test for bits other
7990
// than bit 0? Conservatively bail out in case the verifier
7991
// missed this case.
7992
if (IsTestAndBranch && MI.getOperand(1).getImm())
7993
return false;
7994
7995
// Find Definition.
7996
assert(MI.getParent() && "Incomplete machine instruction\n");
7997
MachineBasicBlock *MBB = MI.getParent();
7998
MachineFunction *MF = MBB->getParent();
7999
MachineRegisterInfo *MRI = &MF->getRegInfo();
8000
Register VReg = MI.getOperand(0).getReg();
8001
if (!VReg.isVirtual())
8002
return false;
8003
8004
MachineInstr *DefMI = MRI->getVRegDef(VReg);
8005
8006
// Look through COPY instructions to find definition.
8007
while (DefMI->isCopy()) {
8008
Register CopyVReg = DefMI->getOperand(1).getReg();
8009
if (!MRI->hasOneNonDBGUse(CopyVReg))
8010
return false;
8011
if (!MRI->hasOneDef(CopyVReg))
8012
return false;
8013
DefMI = MRI->getVRegDef(CopyVReg);
8014
}
8015
8016
switch (DefMI->getOpcode()) {
8017
default:
8018
return false;
8019
// Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8020
case AArch64::ANDWri:
8021
case AArch64::ANDXri: {
8022
if (IsTestAndBranch)
8023
return false;
8024
if (DefMI->getParent() != MBB)
8025
return false;
8026
if (!MRI->hasOneNonDBGUse(VReg))
8027
return false;
8028
8029
bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8030
uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8031
DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8032
if (!isPowerOf2_64(Mask))
8033
return false;
8034
8035
MachineOperand &MO = DefMI->getOperand(1);
8036
Register NewReg = MO.getReg();
8037
if (!NewReg.isVirtual())
8038
return false;
8039
8040
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8041
8042
MachineBasicBlock &RefToMBB = *MBB;
8043
MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8044
DebugLoc DL = MI.getDebugLoc();
8045
unsigned Imm = Log2_64(Mask);
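// E.g. Mask == 0x400 gives Imm == 10, producing the "tbnz w8, #10, L1"
// shown in the example in the function comment above.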
8046
unsigned Opc = (Imm < 32)
8047
? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8048
: (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8049
MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8050
.addReg(NewReg)
8051
.addImm(Imm)
8052
.addMBB(TBB);
8053
// Register lives on to the new TB(N)Z now.
8054
MO.setIsKill(false);
8055
8056
// For bit positions smaller than 32, we need to use the 32-bit
8057
// variant (W) in all cases. Indeed the 64-bit variant does not
8058
// allow encoding them.
8059
// Therefore, if the input register is 64-bit, we need to take the
8060
// 32-bit sub-part.
8061
if (!Is32Bit && Imm < 32)
8062
NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8063
MI.eraseFromParent();
8064
return true;
8065
}
8066
// Look for CSINC
8067
case AArch64::CSINCWr:
8068
case AArch64::CSINCXr: {
8069
if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8070
DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8071
!(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8072
DefMI->getOperand(2).getReg() == AArch64::XZR))
8073
return false;
8074
8075
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8076
true) != -1)
8077
return false;
8078
8079
AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8080
// Convert only when the condition code is not modified between
8081
// the CSINC and the branch. The CC may be used by other
8082
// instructions in between.
8083
if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8084
return false;
8085
MachineBasicBlock &RefToMBB = *MBB;
8086
MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8087
DebugLoc DL = MI.getDebugLoc();
8088
if (IsNegativeBranch)
8089
CC = AArch64CC::getInvertedCondCode(CC);
8090
BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8091
MI.eraseFromParent();
8092
return true;
8093
}
8094
}
8095
}
8096
8097
std::pair<unsigned, unsigned>
8098
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8099
const unsigned Mask = AArch64II::MO_FRAGMENT;
8100
return std::make_pair(TF & Mask, TF & ~Mask);
8101
}
8102
8103
ArrayRef<std::pair<unsigned, const char *>>
8104
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8105
using namespace AArch64II;
8106
8107
static const std::pair<unsigned, const char *> TargetFlags[] = {
8108
{MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8109
{MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8110
{MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8111
{MO_HI12, "aarch64-hi12"}};
8112
return ArrayRef(TargetFlags);
8113
}
8114
8115
ArrayRef<std::pair<unsigned, const char *>>
8116
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8117
using namespace AArch64II;
8118
8119
static const std::pair<unsigned, const char *> TargetFlags[] = {
8120
{MO_COFFSTUB, "aarch64-coffstub"},
8121
{MO_GOT, "aarch64-got"},
8122
{MO_NC, "aarch64-nc"},
8123
{MO_S, "aarch64-s"},
8124
{MO_TLS, "aarch64-tls"},
8125
{MO_DLLIMPORT, "aarch64-dllimport"},
8126
{MO_PREL, "aarch64-prel"},
8127
{MO_TAGGED, "aarch64-tagged"},
8128
{MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8129
};
8130
return ArrayRef(TargetFlags);
8131
}
8132
8133
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8134
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8135
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8136
{{MOSuppressPair, "aarch64-suppress-pair"},
8137
{MOStridedAccess, "aarch64-strided-access"}};
8138
return ArrayRef(TargetFlags);
8139
}
8140
8141
/// Constants defining how certain sequences should be outlined.
8142
/// This encompasses how an outlined function should be called, and what kind of
8143
/// frame should be emitted for that outlined function.
8144
///
8145
/// \p MachineOutlinerDefault implies that the function should be called with
8146
/// a save and restore of LR to the stack.
8147
///
8148
/// That is,
8149
///
8150
/// I1 Save LR OUTLINED_FUNCTION:
8151
/// I2 --> BL OUTLINED_FUNCTION I1
8152
/// I3 Restore LR I2
8153
/// I3
8154
/// RET
8155
///
8156
/// * Call construction overhead: 3 (save + BL + restore)
8157
/// * Frame construction overhead: 1 (ret)
8158
/// * Requires stack fixups? Yes
8159
///
8160
/// \p MachineOutlinerTailCall implies that the function is being created from
8161
/// a sequence of instructions ending in a return.
8162
///
8163
/// That is,
8164
///
8165
/// I1 OUTLINED_FUNCTION:
8166
/// I2 --> B OUTLINED_FUNCTION I1
8167
/// RET I2
8168
/// RET
8169
///
8170
/// * Call construction overhead: 1 (B)
8171
/// * Frame construction overhead: 0 (Return included in sequence)
8172
/// * Requires stack fixups? No
8173
///
8174
/// \p MachineOutlinerNoLRSave implies that the function should be called using
8175
/// a BL instruction, but doesn't require LR to be saved and restored. This
8176
/// happens when LR is known to be dead.
8177
///
8178
/// That is,
8179
///
8180
/// I1 OUTLINED_FUNCTION:
8181
/// I2 --> BL OUTLINED_FUNCTION I1
8182
/// I3 I2
8183
/// I3
8184
/// RET
8185
///
8186
/// * Call construction overhead: 1 (BL)
8187
/// * Frame construction overhead: 1 (RET)
8188
/// * Requires stack fixups? No
8189
///
8190
/// \p MachineOutlinerThunk implies that the function is being created from
8191
/// a sequence of instructions ending in a call. The outlined function is
8192
/// called with a BL instruction, and the outlined function tail-calls the
8193
/// original call destination.
8194
///
8195
/// That is,
8196
///
8197
/// I1 OUTLINED_FUNCTION:
8198
/// I2 --> BL OUTLINED_FUNCTION I1
8199
/// BL f I2
8200
/// B f
8201
/// * Call construction overhead: 1 (BL)
8202
/// * Frame construction overhead: 0
8203
/// * Requires stack fixups? No
8204
///
8205
/// \p MachineOutlinerRegSave implies that the function should be called with a
8206
/// save and restore of LR to an available register. This allows us to avoid
8207
/// stack fixups. Note that this outlining variant is compatible with the
8208
/// NoLRSave case.
8209
///
8210
/// That is,
8211
///
8212
/// I1 Save LR OUTLINED_FUNCTION:
8213
/// I2 --> BL OUTLINED_FUNCTION I1
8214
/// I3 Restore LR I2
8215
/// I3
8216
/// RET
8217
///
8218
/// * Call construction overhead: 3 (save + BL + restore)
8219
/// * Frame construction overhead: 1 (ret)
8220
/// * Requires stack fixups? No
8221
enum MachineOutlinerClass {
8222
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8223
MachineOutlinerTailCall, /// Only emit a branch.
8224
MachineOutlinerNoLRSave, /// Emit a call and return.
8225
MachineOutlinerThunk, /// Emit a call and tail-call.
8226
MachineOutlinerRegSave /// Same as default, but save to a register.
8227
};
8228
8229
enum MachineOutlinerMBBFlags {
8230
LRUnavailableSomewhere = 0x2,
8231
HasCalls = 0x4,
8232
UnsafeRegsDead = 0x8
8233
};
8234
8235
Register
8236
AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8237
MachineFunction *MF = C.getMF();
8238
const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8239
const AArch64RegisterInfo *ARI =
8240
static_cast<const AArch64RegisterInfo *>(&TRI);
8241
// Check if there is an available register across the sequence that we can
8242
// use.
8243
for (unsigned Reg : AArch64::GPR64RegClass) {
8244
if (!ARI->isReservedReg(*MF, Reg) &&
8245
Reg != AArch64::LR && // LR is not reserved, but don't use it.
8246
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8247
Reg != AArch64::X17 && // Ditto for X17.
8248
C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8249
C.isAvailableInsideSeq(Reg, TRI))
8250
return Reg;
8251
}
8252
return Register();
8253
}
8254
8255
static bool
8256
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8257
const outliner::Candidate &b) {
8258
const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8259
const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8260
8261
return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8262
MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8263
}
8264
8265
static bool
8266
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8267
const outliner::Candidate &b) {
8268
const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8269
const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8270
8271
return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8272
}
8273
8274
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8275
const outliner::Candidate &b) {
8276
const AArch64Subtarget &SubtargetA =
8277
a.getMF()->getSubtarget<AArch64Subtarget>();
8278
const AArch64Subtarget &SubtargetB =
8279
b.getMF()->getSubtarget<AArch64Subtarget>();
8280
return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8281
}
8282
8283
std::optional<outliner::OutlinedFunction>
8284
AArch64InstrInfo::getOutliningCandidateInfo(
8285
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8286
unsigned SequenceSize = 0;
8287
for (auto &MI : RepeatedSequenceLocs[0])
8288
SequenceSize += getInstSizeInBytes(MI);
8289
8290
unsigned NumBytesToCreateFrame = 0;
8291
8292
// We only allow outlining for functions having exactly matching return
8293
// address signing attributes, i.e., all share the same value for the
8294
// attribute "sign-return-address" and all share the same type of key they
8295
// are signed with.
8296
// Additionally we require all functions to simultaneously either support
8297
// v8.3a features or not. Otherwise an outlined function could get signed
8298
// using dedicated v8.3 instructions and a call from a function that doesn't
8299
// support v8.3 instructions would therefore be invalid.
8300
if (std::adjacent_find(
8301
RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8302
[](const outliner::Candidate &a, const outliner::Candidate &b) {
8303
// Return true if a and b are non-equal w.r.t. return address
8304
// signing or support of v8.3a features
8305
if (outliningCandidatesSigningScopeConsensus(a, b) &&
8306
outliningCandidatesSigningKeyConsensus(a, b) &&
8307
outliningCandidatesV8_3OpsConsensus(a, b)) {
8308
return false;
8309
}
8310
return true;
8311
}) != RepeatedSequenceLocs.end()) {
8312
return std::nullopt;
8313
}
8314
8315
// Since at this point all candidates agree on their return address signing
8316
// picking just one is fine. If the candidate functions potentially sign their
8317
// return addresses, the outlined function should do the same. Note that in
8318
// the case of "sign-return-address"="non-leaf" this is an assumption: It is
8319
// not certainly true that the outlined function will have to sign its return
8320
// address but this decision is made later, when the decision to outline
8321
// has already been made.
8322
// The same holds for the number of additional instructions we need: On
8323
// v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8324
// necessary. However, at this point we don't know if the outlined function
8325
// will have a RET instruction so we assume the worst.
8326
const TargetRegisterInfo &TRI = getRegisterInfo();
8327
// Performing a tail call may require extra checks when PAuth is enabled.
8328
// If PAuth is disabled, set it to zero for uniformity.
8329
unsigned NumBytesToCheckLRInTCEpilogue = 0;
8330
if (RepeatedSequenceLocs[0]
8331
.getMF()
8332
->getInfo<AArch64FunctionInfo>()
8333
->shouldSignReturnAddress(true)) {
8334
// One PAC and one AUT instruction
8335
NumBytesToCreateFrame += 8;
8336
8337
// PAuth is enabled - set extra tail call cost, if any.
8338
auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8339
*RepeatedSequenceLocs[0].getMF());
8340
NumBytesToCheckLRInTCEpilogue =
8341
AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8342
// Checking the authenticated LR value may significantly impact
8343
// SequenceSize, so account for it for more precise results.
8344
if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8345
SequenceSize += NumBytesToCheckLRInTCEpilogue;
8346
8347
// We have to check if sp modifying instructions would get outlined.
8348
// If so we only allow outlining if sp is unchanged overall, so matching
8349
// sub and add instructions are okay to outline, all other sp modifications
8350
// are not.
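// E.g. a matched "sub sp, sp, #16" ... "add sp, sp, #16" pair nets to zero
// and is fine to outline, whereas an unmatched adjustment leaves SPValue
// nonzero and the candidate is rejected.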
8351
auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8352
int SPValue = 0;
8353
for (auto &MI : C) {
8354
if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8355
switch (MI.getOpcode()) {
8356
case AArch64::ADDXri:
8357
case AArch64::ADDWri:
8358
assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8359
assert(MI.getOperand(2).isImm() &&
8360
"Expected operand to be immediate");
8361
assert(MI.getOperand(1).isReg() &&
8362
"Expected operand to be a register");
8363
// Check if the add just increments sp. If so, we search for
8364
// matching sub instructions that decrement sp. If not, the
8365
// modification is illegal
8366
if (MI.getOperand(1).getReg() == AArch64::SP)
8367
SPValue += MI.getOperand(2).getImm();
8368
else
8369
return true;
8370
break;
8371
case AArch64::SUBXri:
8372
case AArch64::SUBWri:
8373
assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8374
assert(MI.getOperand(2).isImm() &&
8375
"Expected operand to be immediate");
8376
assert(MI.getOperand(1).isReg() &&
8377
"Expected operand to be a register");
8378
// Check if the sub just decrements sp. If so, we search for
8379
// matching add instructions that increment sp. If not, the
8380
// modification is illegal
8381
if (MI.getOperand(1).getReg() == AArch64::SP)
8382
SPValue -= MI.getOperand(2).getImm();
8383
else
8384
return true;
8385
break;
8386
default:
8387
return true;
8388
}
8389
}
8390
}
8391
if (SPValue)
8392
return true;
8393
return false;
8394
};
8395
// Remove candidates with illegal stack modifying instructions
8396
llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8397
8398
// If the sequence doesn't have enough candidates left, then we're done.
8399
if (RepeatedSequenceLocs.size() < 2)
8400
return std::nullopt;
8401
}
8402
8403
// Properties about candidate MBBs that hold for all of them.
8404
unsigned FlagsSetInAll = 0xF;
8405
8406
// Compute liveness information for each candidate, and set FlagsSetInAll.
8407
for (outliner::Candidate &C : RepeatedSequenceLocs)
8408
FlagsSetInAll &= C.Flags;
8409
8410
unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8411
8412
// Helper lambda which sets call information for every candidate.
8413
auto SetCandidateCallInfo =
8414
[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8415
for (outliner::Candidate &C : RepeatedSequenceLocs)
8416
C.setCallInfo(CallID, NumBytesForCall);
8417
};
8418
8419
unsigned FrameID = MachineOutlinerDefault;
8420
NumBytesToCreateFrame += 4;
8421
8422
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8423
return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8424
});
8425
8426
// We check to see if CFI Instructions are present, and if they are
8427
// we find the number of CFI Instructions in the candidates.
8428
unsigned CFICount = 0;
8429
for (auto &I : RepeatedSequenceLocs[0]) {
8430
if (I.isCFIInstruction())
8431
CFICount++;
8432
}
8433
8434
// We compare the number of found CFI Instructions to the number of CFI
8435
// instructions in the parent function for each candidate. We must check this
8436
// since if we outline one of the CFI instructions in a function, we have to
8437
// outline them all for correctness. If we do not, the address offsets will be
8438
// incorrect between the two sections of the program.
8439
for (outliner::Candidate &C : RepeatedSequenceLocs) {
8440
std::vector<MCCFIInstruction> CFIInstructions =
8441
C.getMF()->getFrameInstructions();
8442
8443
if (CFICount > 0 && CFICount != CFIInstructions.size())
8444
return std::nullopt;
8445
}
8446
8447
// Returns true if an instruction is safe to fix up, false otherwise.
8448
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8449
if (MI.isCall())
8450
return true;
8451
8452
if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8453
!MI.readsRegister(AArch64::SP, &TRI))
8454
return true;
8455
8456
// Any modification of SP will break our code to save/restore LR.
8457
// FIXME: We could handle some instructions which add a constant
8458
// offset to SP, with a bit more work.
8459
if (MI.modifiesRegister(AArch64::SP, &TRI))
8460
return false;
8461
8462
// At this point, we have a stack instruction that we might need to
8463
// fix up. We'll handle it if it's a load or store.
8464
if (MI.mayLoadOrStore()) {
8465
const MachineOperand *Base; // Filled with the base operand of MI.
8466
int64_t Offset; // Filled with the offset of MI.
8467
bool OffsetIsScalable;
8468
8469
// Does it allow us to offset the base operand and is the base the
8470
// register SP?
8471
if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8472
!Base->isReg() || Base->getReg() != AArch64::SP)
8473
return false;
8474
8475
// Fix-up code below assumes bytes.
8476
if (OffsetIsScalable)
8477
return false;
8478
8479
// Find the minimum/maximum offset for this instruction and check
8480
// if fixing it up would be in range.
8481
int64_t MinOffset,
8482
MaxOffset; // Unscaled offsets for the instruction.
8483
// The scale to multiply the offsets by.
8484
TypeSize Scale(0U, false), DummyWidth(0U, false);
8485
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8486
8487
Offset += 16; // Update the offset to what it would be if we outlined.
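// E.g. "ldr x0, [sp, #8]" in the candidate would become "ldr x0, [sp, #24]"
// once the outlined function has saved LR with "str x30, [sp, #-16]!".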
8488
if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8489
Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8490
return false;
8491
8492
// It's in range, so we can outline it.
8493
return true;
8494
}
8495
8496
// FIXME: Add handling for instructions like "add x0, sp, #8".
8497
8498
// We can't fix it up, so don't outline it.
8499
return false;
8500
};
8501
8502
// True if it's possible to fix up each stack instruction in this sequence.
8503
// Important for frames/call variants that modify the stack.
8504
bool AllStackInstrsSafe =
8505
llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8506
8507
// If the last instruction in any candidate is a terminator, then we should
8508
// tail call all of the candidates.
8509
if (RepeatedSequenceLocs[0].back().isTerminator()) {
8510
FrameID = MachineOutlinerTailCall;
8511
NumBytesToCreateFrame = 0;
8512
unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8513
SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8514
}
8515
8516
else if (LastInstrOpcode == AArch64::BL ||
8517
((LastInstrOpcode == AArch64::BLR ||
8518
LastInstrOpcode == AArch64::BLRNoIP) &&
8519
!HasBTI)) {
8520
// FIXME: Do we need to check if the code after this uses the value of LR?
8521
FrameID = MachineOutlinerThunk;
8522
NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8523
SetCandidateCallInfo(MachineOutlinerThunk, 4);
8524
}
8525
8526
else {
8527
// We need to decide how to emit calls + frames. We can always emit the same
8528
// frame if we don't need to save to the stack. If we have to save to the
8529
// stack, then we need a different frame.
8530
unsigned NumBytesNoStackCalls = 0;
8531
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8532
8533
// Check if we have to save LR.
8534
for (outliner::Candidate &C : RepeatedSequenceLocs) {
8535
bool LRAvailable =
8536
(C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8537
? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8538
: true;
8539
// If we have a noreturn caller, then we're going to be conservative and
8540
// say that we have to save LR. If we don't have a ret at the end of the
8541
// block, then we can't reason about liveness accurately.
8542
//
8543
// FIXME: We can probably do better than always disabling this in
8544
// noreturn functions by fixing up the liveness info.
8545
bool IsNoReturn =
8546
C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8547
8548
// Is LR available? If so, we don't need a save.
8549
if (LRAvailable && !IsNoReturn) {
8550
NumBytesNoStackCalls += 4;
8551
C.setCallInfo(MachineOutlinerNoLRSave, 4);
8552
CandidatesWithoutStackFixups.push_back(C);
8553
}
8554
8555
// Is an unused register available? If so, we won't modify the stack, so
8556
// we can outline with the same frame type as those that don't save LR.
8557
else if (findRegisterToSaveLRTo(C)) {
8558
NumBytesNoStackCalls += 12;
8559
C.setCallInfo(MachineOutlinerRegSave, 12);
8560
CandidatesWithoutStackFixups.push_back(C);
8561
}
8562
8563
// Is SP used in the sequence at all? If not, we don't have to modify
8564
// the stack, so we are guaranteed to get the same frame.
8565
else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8566
NumBytesNoStackCalls += 12;
8567
C.setCallInfo(MachineOutlinerDefault, 12);
8568
CandidatesWithoutStackFixups.push_back(C);
8569
}
8570
8571
// If we outline this, we need to modify the stack. Pretend we don't
8572
// outline this by saving all of its bytes.
8573
else {
8574
NumBytesNoStackCalls += SequenceSize;
8575
}
8576
}
8577
8578
// If there are no places where we have to save LR, then note that we
8579
// don't have to update the stack. Otherwise, give every candidate the
8580
// default call type, as long as it's safe to do so.
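// (12 bytes is the cost of the default call sequence: save LR, BL, and
// restore LR, at 4 bytes each. So the check below keeps the no-stack-fixup
// variants whenever they are collectively no more expensive than giving
// every candidate the default call type.)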
8581
if (!AllStackInstrsSafe ||
8582
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8583
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8584
FrameID = MachineOutlinerNoLRSave;
8585
if (RepeatedSequenceLocs.size() < 2)
8586
return std::nullopt;
8587
} else {
8588
SetCandidateCallInfo(MachineOutlinerDefault, 12);
8589
8590
// Bugzilla ID: 46767
8591
// TODO: Check if fixing up the stack more than once is safe so we can
8592
// outline these.
8593
//
8594
// An outline resulting in a caller that requires stack fixups at the
8595
// callsite to a callee that also requires stack fixups can happen when
8596
// there are no available registers at the candidate callsite for a
8597
// candidate that itself also has calls.
8598
//
8599
// In other words if function_containing_sequence in the following pseudo
8600
// assembly requires that we save LR at the point of the call, but there
8601
// are no available registers: in this case we save using SP and as a
8602
// result the SP offsets require stack fixups by multiples of 16.
8603
//
8604
// function_containing_sequence:
8605
// ...
8606
// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8607
// call OUTLINED_FUNCTION_N
8608
// restore LR from SP
8609
// ...
8610
//
8611
// OUTLINED_FUNCTION_N:
8612
// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8613
// ...
8614
// bl foo
8615
// restore LR from SP
8616
// ret
8617
//
8618
// Because the code to handle more than one stack fixup does not
8619
// currently have the proper checks for legality, these cases will assert
8620
// in the AArch64 MachineOutliner. This is because the code to do this
8621
// needs more hardening, testing, better checks that generated code is
8622
// legal, etc and because it is only verified to handle a single pass of
8623
// stack fixup.
8624
//
8625
// The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8626
// these cases until they are known to be handled. Bugzilla 46767 is
8627
// referenced in comments at the assert site.
8628
//
8629
// To avoid asserting (or generating non-legal code on noassert builds)
8630
// we remove all candidates which would need more than one stack fixup by
8631
// pruning the cases where the candidate has calls while also having no
8632
// available LR and having no available general purpose registers to copy
8633
// LR to (ie one extra stack save/restore).
8634
//
8635
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8636
erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8637
auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8638
return (llvm::any_of(C, IsCall)) &&
8639
(!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8640
!findRegisterToSaveLRTo(C));
8641
});
8642
}
8643
}
8644
8645
// If we dropped all of the candidates, bail out here.
8646
if (RepeatedSequenceLocs.size() < 2) {
8647
RepeatedSequenceLocs.clear();
8648
return std::nullopt;
8649
}
8650
}
8651
8652
// Does every candidate's MBB contain a call? If so, then we might have a call
8653
// in the range.
8654
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8655
// Check if the range contains a call. These require a save + restore of the
8656
// link register.
8657
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8658
bool ModStackToSaveLR = false;
8659
if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8660
[](const MachineInstr &MI) { return MI.isCall(); }))
8661
ModStackToSaveLR = true;
8662
8663
// Handle the last instruction separately. If this is a tail call, then the
8664
// last instruction is a call. We don't want to save + restore in this case.
8665
// However, it could be possible that the last instruction is a call without
8666
// it being valid to tail call this sequence. We should consider this as
8667
// well.
8668
else if (FrameID != MachineOutlinerThunk &&
8669
FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8670
ModStackToSaveLR = true;
8671
8672
if (ModStackToSaveLR) {
8673
// We can't fix up the stack. Bail out.
8674
if (!AllStackInstrsSafe) {
8675
RepeatedSequenceLocs.clear();
8676
return std::nullopt;
8677
}
8678
8679
// Save + restore LR.
8680
NumBytesToCreateFrame += 8;
8681
}
8682
}
8683
8684
// If we have CFI instructions, we can only outline if the outlined section
8685
// can be a tail call
8686
if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8687
return std::nullopt;
8688
8689
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8690
NumBytesToCreateFrame, FrameID);
8691
}
8692
8693
void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8694
Function &F, std::vector<outliner::Candidate> &Candidates) const {
8695
// If a bunch of candidates reach this point they must agree on their return
8696
// address signing. It is therefore enough to just consider the signing
8697
// behaviour of one of them
8698
const auto &CFn = Candidates.front().getMF()->getFunction();
8699
8700
if (CFn.hasFnAttribute("ptrauth-returns"))
8701
F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
8702
if (CFn.hasFnAttribute("ptrauth-auth-traps"))
8703
F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
8704
// Since all candidates belong to the same module, just copy the
8705
// function-level attributes of an arbitrary function.
8706
if (CFn.hasFnAttribute("sign-return-address"))
8707
F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8708
if (CFn.hasFnAttribute("sign-return-address-key"))
8709
F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8710
8711
AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8712
}
8713
8714
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8715
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8716
const Function &F = MF.getFunction();
8717
8718
// Can F be deduplicated by the linker? If it can, don't outline from it.
8719
if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8720
return false;
8721
8722
// Don't outline from functions with section markings; the program could
8723
// expect that all the code is in the named section.
8724
// FIXME: Allow outlining from multiple functions with the same section
8725
// marking.
8726
if (F.hasSection())
8727
return false;
8728
8729
// Outlining from functions with redzones is unsafe since the outliner may
8730
// modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8731
// outline from it.
8732
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8733
if (!AFI || AFI->hasRedZone().value_or(true))
8734
return false;
8735
8736
// FIXME: Determine whether it is safe to outline from functions which contain
8737
// streaming-mode changes. We may need to ensure any smstart/smstop pairs are
8738
// outlined together and ensure it is safe to outline with async unwind info,
8739
// required for saving & restoring VG around calls.
8740
if (AFI->hasStreamingModeChanges())
8741
return false;
8742
8743
// FIXME: Teach the outliner to generate/handle Windows unwind info.
8744
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8745
return false;
8746
8747
// It's safe to outline from MF.
8748
return true;
8749
}
8750
8751
SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8752
AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8753
unsigned &Flags) const {
8754
assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8755
"Must track liveness!");
8756
SmallVector<
8757
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8758
Ranges;
8759
// According to the AArch64 Procedure Call Standard, the following are
8760
// undefined on entry/exit from a function call:
8761
//
8762
// * Registers x16, x17, (and thus w16, w17)
8763
// * Condition codes (and thus the NZCV register)
8764
//
8765
// If any of these registers are used inside or live across an outlined
8766
// function, then they may be modified later, either by the compiler or
8767
// some other tool (like the linker).
8768
//
8769
// To avoid outlining in these situations, partition each block into ranges
8770
// where these registers are dead. We will only outline from those ranges.
8771
LiveRegUnits LRU(getRegisterInfo());
8772
auto AreAllUnsafeRegsDead = [&LRU]() {
8773
return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8774
LRU.available(AArch64::NZCV);
8775
};
8776
8777
// We need to know if LR is live across an outlining boundary later on in
8778
// order to decide how we'll create the outlined call, frame, etc.
8779
//
8780
// It's pretty expensive to check this for *every candidate* within a block.
8781
// That's some potentially n^2 behaviour, since in the worst case, we'd need
8782
// to compute liveness from the end of the block for O(n) candidates within
8783
// the block.
8784
//
8785
// So, to improve the average case, let's keep track of liveness from the end
8786
// of the block to the beginning of *every outlinable range*. If we know that
8787
// LR is available in every range we could outline from, then we know that
8788
// we don't need to check liveness for any candidate within that range.
8789
bool LRAvailableEverywhere = true;
8790
// Compute liveness bottom-up.
8791
LRU.addLiveOuts(MBB);
8792
// Update flags that require info about the entire MBB.
8793
auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8794
if (MI.isCall() && !MI.isTerminator())
8795
Flags |= MachineOutlinerMBBFlags::HasCalls;
8796
};
8797
// Range: [RangeBegin, RangeEnd)
8798
MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8799
unsigned RangeLen;
8800
auto CreateNewRangeStartingAt =
8801
[&RangeBegin, &RangeEnd,
8802
&RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8803
RangeBegin = NewBegin;
8804
RangeEnd = std::next(RangeBegin);
8805
RangeLen = 0;
8806
};
8807
auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8808
// At least one unsafe register is not dead. We do not want to outline at
8809
// this point. If it is long enough to outline from, save the range
8810
// [RangeBegin, RangeEnd).
8811
if (RangeLen > 1)
8812
Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8813
};
8814
// Find the first point where all unsafe registers are dead.
8815
// FIND: <safe instr> <-- end of first potential range
8816
// SKIP: <unsafe def>
8817
// SKIP: ... everything between ...
8818
// SKIP: <unsafe use>
8819
auto FirstPossibleEndPt = MBB.instr_rbegin();
8820
for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8821
LRU.stepBackward(*FirstPossibleEndPt);
8822
// Update flags that impact how we outline across the entire block,
8823
// regardless of safety.
8824
UpdateWholeMBBFlags(*FirstPossibleEndPt);
8825
if (AreAllUnsafeRegsDead())
8826
break;
8827
}
8828
// If we exhausted the entire block, we have no safe ranges to outline.
8829
if (FirstPossibleEndPt == MBB.instr_rend())
8830
return Ranges;
8831
// Current range.
8832
CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8833
// StartPt points to the first place where all unsafe registers
8834
// are dead (if there is any such point). Begin partitioning the MBB into
8835
// ranges.
8836
for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8837
LRU.stepBackward(MI);
8838
UpdateWholeMBBFlags(MI);
8839
if (!AreAllUnsafeRegsDead()) {
8840
SaveRangeIfNonEmpty();
8841
CreateNewRangeStartingAt(MI.getIterator());
8842
continue;
8843
}
8844
LRAvailableEverywhere &= LRU.available(AArch64::LR);
8845
RangeBegin = MI.getIterator();
8846
++RangeLen;
8847
}
8848
// Above loop misses the last (or only) range. If we are still safe, then
8849
// let's save the range.
8850
if (AreAllUnsafeRegsDead())
8851
SaveRangeIfNonEmpty();
8852
if (Ranges.empty())
8853
return Ranges;
8854
// We found the ranges bottom-up. Mapping expects them top-down. Reverse
8855
// the order.
8856
std::reverse(Ranges.begin(), Ranges.end());
8857
// If there is at least one outlinable range where LR is unavailable
8858
// somewhere, remember that.
8859
if (!LRAvailableEverywhere)
8860
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8861
return Ranges;
8862
}
8863
8864
outliner::InstrType
8865
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8866
unsigned Flags) const {
8867
MachineInstr &MI = *MIT;
8868
MachineBasicBlock *MBB = MI.getParent();
8869
MachineFunction *MF = MBB->getParent();
8870
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8871
8872
// Don't outline anything used for return address signing. The outlined
8873
// function will get signed later if needed
8874
switch (MI.getOpcode()) {
8875
case AArch64::PACM:
8876
case AArch64::PACIASP:
8877
case AArch64::PACIBSP:
8878
case AArch64::PACIASPPC:
8879
case AArch64::PACIBSPPC:
8880
case AArch64::AUTIASP:
8881
case AArch64::AUTIBSP:
8882
case AArch64::AUTIASPPCi:
8883
case AArch64::AUTIASPPCr:
8884
case AArch64::AUTIBSPPCi:
8885
case AArch64::AUTIBSPPCr:
8886
case AArch64::RETAA:
8887
case AArch64::RETAB:
8888
case AArch64::RETAASPPCi:
8889
case AArch64::RETAASPPCr:
8890
case AArch64::RETABSPPCi:
8891
case AArch64::RETABSPPCr:
8892
case AArch64::EMITBKEY:
8893
case AArch64::PAUTH_PROLOGUE:
8894
case AArch64::PAUTH_EPILOGUE:
8895
return outliner::InstrType::Illegal;
8896
}
8897
8898
// Don't outline LOHs.
8899
if (FuncInfo->getLOHRelated().count(&MI))
8900
return outliner::InstrType::Illegal;
8901
8902
// We can only outline these if we will tail call the outlined function, or
8903
// fix up the CFI offsets. Currently, CFI instructions are outlined only if
8904
// in a tail call.
8905
//
8906
// FIXME: If the proper fixups for the offset are implemented, this should be
8907
// possible.
8908
if (MI.isCFIInstruction())
8909
return outliner::InstrType::Legal;
8910
8911
// Is this a terminator for a basic block?
8912
if (MI.isTerminator())
8913
// TargetInstrInfo::getOutliningType has already filtered out anything
8914
// that would break this, so we can allow it here.
8915
return outliner::InstrType::Legal;
8916
8917
// Make sure none of the operands are un-outlinable.
8918
for (const MachineOperand &MOP : MI.operands()) {
8919
// A check preventing CFI indices was here before, but only CFI
8920
// instructions should have those.
8921
assert(!MOP.isCFIIndex());
8922
8923
// If it uses LR or W30 explicitly, then don't touch it.
8924
if (MOP.isReg() && !MOP.isImplicit() &&
8925
(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8926
return outliner::InstrType::Illegal;
8927
}
8928
8929
// Special cases for instructions that can always be outlined, but will fail
8930
// the later tests. E.g., ADRPs, which are PC-relative, use LR, but can always
8931
// be outlined because they don't require a *specific* value to be in LR.
8932
if (MI.getOpcode() == AArch64::ADRP)
8933
return outliner::InstrType::Legal;
8934
8935
// If MI is a call we might be able to outline it. We don't want to outline
8936
// any calls that rely on the position of items on the stack. When we outline
8937
// something containing a call, we have to emit a save and restore of LR in
8938
// the outlined function. Currently, this always happens by saving LR to the
8939
// stack. Thus, if we outline, say, half the parameters for a function call
8940
// plus the call, then we'll break the callee's expectations for the layout
8941
// of the stack.
8942
//
8943
// FIXME: Allow calls to functions which construct a stack frame, as long
8944
// as they don't access arguments on the stack.
8945
// FIXME: Figure out some way to analyze functions defined in other modules.
8946
// We should be able to compute the memory usage based on the IR calling
8947
// convention, even if we can't see the definition.
8948
if (MI.isCall()) {
8949
// Get the function associated with the call. Look at each operand and find
8950
// the one that represents the callee and get its name.
8951
const Function *Callee = nullptr;
8952
for (const MachineOperand &MOP : MI.operands()) {
8953
if (MOP.isGlobal()) {
8954
Callee = dyn_cast<Function>(MOP.getGlobal());
8955
break;
8956
}
8957
}
8958
8959
// Never outline calls to mcount. There isn't any rule that would require
8960
// this, but the Linux kernel's "ftrace" feature depends on it.
8961
if (Callee && Callee->getName() == "\01_mcount")
8962
return outliner::InstrType::Illegal;
8963
8964
// If we don't know anything about the callee, assume it depends on the
8965
// stack layout of the caller. In that case, it's only legal to outline
8966
// as a tail-call. Explicitly list the call instructions we know about so we
8967
// don't get unexpected results with call pseudo-instructions.
8968
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8969
if (MI.getOpcode() == AArch64::BLR ||
8970
MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8971
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8972
8973
if (!Callee)
8974
return UnknownCallOutlineType;
8975
8976
// We have a function we have information about. Check if it's something we
8977
// can safely outline.
8978
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8979
8980
// We don't know what's going on with the callee at all. Don't touch it.
8981
if (!CalleeMF)
8982
return UnknownCallOutlineType;
8983
8984
// Check if we know anything about the callee saves on the function. If we
8985
// don't, then don't touch it, since that implies that we haven't
8986
// computed anything about its stack frame yet.
8987
MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8988
if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8989
MFI.getNumObjects() > 0)
8990
return UnknownCallOutlineType;
8991
8992
// At this point, we can say that CalleeMF ought to not pass anything on the
8993
// stack. Therefore, we can outline it.
8994
return outliner::InstrType::Legal;
8995
}
8996
8997
// Don't touch the link register or W30.
8998
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8999
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9000
return outliner::InstrType::Illegal;
9001
9002
// Don't outline BTI instructions, because that will prevent the outlining
9003
// site from being indirectly callable.
9004
if (hasBTISemantics(MI))
9005
return outliner::InstrType::Illegal;
9006
9007
return outliner::InstrType::Legal;
9008
}
9009
9010
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9011
for (MachineInstr &MI : MBB) {
9012
const MachineOperand *Base;
9013
TypeSize Width(0, false);
9014
int64_t Offset;
9015
bool OffsetIsScalable;
9016
9017
// Is this a load or store with an immediate offset with SP as the base?
9018
if (!MI.mayLoadOrStore() ||
9019
!getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9020
&RI) ||
9021
(Base->isReg() && Base->getReg() != AArch64::SP))
9022
continue;
9023
9024
// It is, so we have to fix it up.
9025
TypeSize Scale(0U, false);
9026
int64_t Dummy1, Dummy2;
9027
9028
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9029
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9030
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9031
assert(Scale != 0 && "Unexpected opcode!");
9032
assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9033
9034
// We've pushed the return address to the stack, so add 16 to the offset.
9035
// This is safe, since we already checked if it would overflow when we
9036
// checked if this instruction was legal to outline.
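// E.g. "ldr x0, [sp, #8]" (byte offset 8, scale 8, encoded immediate 1)
// becomes "ldr x0, [sp, #24]" with encoded immediate 3.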
9037
int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9038
StackOffsetOperand.setImm(NewImm);
9039
}
9040
}
9041
9042
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9043
const AArch64InstrInfo *TII,
9044
bool ShouldSignReturnAddr) {
9045
if (!ShouldSignReturnAddr)
9046
return;
9047
9048
BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9049
.setMIFlag(MachineInstr::FrameSetup);
9050
BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9051
TII->get(AArch64::PAUTH_EPILOGUE))
9052
.setMIFlag(MachineInstr::FrameDestroy);
9053
}
9054
9055
void AArch64InstrInfo::buildOutlinedFrame(
9056
MachineBasicBlock &MBB, MachineFunction &MF,
9057
const outliner::OutlinedFunction &OF) const {
9058
9059
AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9060
9061
if (OF.FrameConstructionID == MachineOutlinerTailCall)
9062
FI->setOutliningStyle("Tail Call");
9063
else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9064
// For thunk outlining, rewrite the last instruction from a call to a
9065
// tail-call.
9066
MachineInstr *Call = &*--MBB.instr_end();
9067
unsigned TailOpcode;
9068
if (Call->getOpcode() == AArch64::BL) {
9069
TailOpcode = AArch64::TCRETURNdi;
9070
} else {
9071
assert(Call->getOpcode() == AArch64::BLR ||
9072
Call->getOpcode() == AArch64::BLRNoIP);
9073
TailOpcode = AArch64::TCRETURNriALL;
9074
}
9075
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9076
.add(Call->getOperand(0))
9077
.addImm(0);
9078
MBB.insert(MBB.end(), TC);
9079
Call->eraseFromParent();
9080
9081
FI->setOutliningStyle("Thunk");
9082
}
9083
9084
bool IsLeafFunction = true;
9085
9086
// Is there a call in the outlined range?
9087
auto IsNonTailCall = [](const MachineInstr &MI) {
9088
return MI.isCall() && !MI.isReturn();
9089
};
9090
9091
if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9092
// Fix up the instructions in the range, since we're going to modify the
9093
// stack.
9094
9095
// Bugzilla ID: 46767
9096
// TODO: Check if fixing up twice is safe so we can outline these.
9097
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9098
"Can only fix up stack references once");
9099
fixupPostOutline(MBB);
9100
9101
IsLeafFunction = false;
9102
9103
// LR has to be a live in so that we can save it.
9104
if (!MBB.isLiveIn(AArch64::LR))
9105
MBB.addLiveIn(AArch64::LR);
9106
9107
MachineBasicBlock::iterator It = MBB.begin();
9108
MachineBasicBlock::iterator Et = MBB.end();
9109
9110
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9111
OF.FrameConstructionID == MachineOutlinerThunk)
9112
Et = std::prev(MBB.end());
9113
9114
// Insert a save before the outlined region
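// (i.e. "str x30, [sp, #-16]!", which also moves SP down by 16 bytes)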
9115
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9116
.addReg(AArch64::SP, RegState::Define)
9117
.addReg(AArch64::LR)
9118
.addReg(AArch64::SP)
9119
.addImm(-16);
9120
It = MBB.insert(It, STRXpre);
9121
9122
if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9123
const TargetSubtargetInfo &STI = MF.getSubtarget();
9124
const MCRegisterInfo *MRI = STI.getRegisterInfo();
9125
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9126
9127
// Add a CFI saying the stack was moved 16 B down.
9128
int64_t StackPosEntry =
9129
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9130
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9131
.addCFIIndex(StackPosEntry)
9132
.setMIFlags(MachineInstr::FrameSetup);
9133
9134
// Add a CFI saying that the LR that we want to find is now 16 B higher
9135
// than before.
9136
int64_t LRPosEntry = MF.addFrameInst(
9137
MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9138
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9139
.addCFIIndex(LRPosEntry)
9140
.setMIFlags(MachineInstr::FrameSetup);
9141
}
9142
9143
// Insert a restore before the terminator for the function.
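// (i.e. "ldr x30, [sp], #16", undoing the save above)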
9144
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9145
.addReg(AArch64::SP, RegState::Define)
9146
.addReg(AArch64::LR, RegState::Define)
9147
.addReg(AArch64::SP)
9148
.addImm(16);
9149
Et = MBB.insert(Et, LDRXpost);
9150
}
9151
9152
bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9153
9154
// If this is a tail call outlined function, then there's already a return.
9155
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9156
OF.FrameConstructionID == MachineOutlinerThunk) {
9157
signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9158
return;
9159
}
9160
9161
// It's not a tail call, so we have to insert the return ourselves.
9162
9163
// LR has to be a live in so that we can return to it.
9164
if (!MBB.isLiveIn(AArch64::LR))
9165
MBB.addLiveIn(AArch64::LR);
9166
9167
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9168
.addReg(AArch64::LR);
9169
MBB.insert(MBB.end(), ret);
9170
9171
signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9172
9173
FI->setOutliningStyle("Function");
9174
9175
// Did we have to modify the stack by saving the link register?
9176
if (OF.FrameConstructionID != MachineOutlinerDefault)
9177
return;
9178
9179
// We modified the stack.
9180
// Walk over the basic block and fix up all the stack accesses.
9181
fixupPostOutline(MBB);
9182
}
9183
9184
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9185
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9186
MachineFunction &MF, outliner::Candidate &C) const {
9187
9188
// Are we tail calling?
9189
if (C.CallConstructionID == MachineOutlinerTailCall) {
9190
// If yes, then we can just branch to the label.
9191
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9192
.addGlobalAddress(M.getNamedValue(MF.getName()))
9193
.addImm(0));
9194
return It;
9195
}
9196
9197
// Are we saving the link register?
9198
if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9199
C.CallConstructionID == MachineOutlinerThunk) {
9200
// No, so just insert the call.
9201
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9202
.addGlobalAddress(M.getNamedValue(MF.getName())));
9203
return It;
9204
}
9205
9206
// We want to return the spot where we inserted the call.
9207
MachineBasicBlock::iterator CallPt;
9208
9209
// Instructions for saving and restoring LR around the call instruction we're
9210
// going to insert.
9211
MachineInstr *Save;
9212
MachineInstr *Restore;
9213
// Can we save to a register?
9214
if (C.CallConstructionID == MachineOutlinerRegSave) {
9215
// FIXME: This logic should be sunk into a target-specific interface so that
9216
// we don't have to recompute the register.
9217
Register Reg = findRegisterToSaveLRTo(C);
9218
assert(Reg && "No callee-saved register available?");
9219
9220
// LR has to be a live in so that we can save it.
9221
if (!MBB.isLiveIn(AArch64::LR))
9222
MBB.addLiveIn(AArch64::LR);
9223
9224
// Save and restore LR from Reg.
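// The ORRXrs forms below are the standard "mov <Reg>, x30" and
// "mov x30, <Reg>" aliases.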
9225
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9226
.addReg(AArch64::XZR)
9227
.addReg(AArch64::LR)
9228
.addImm(0);
9229
Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9230
.addReg(AArch64::XZR)
9231
.addReg(Reg)
9232
.addImm(0);
9233
} else {
9234
// We have the default case. Save and restore from SP.
9235
Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9236
.addReg(AArch64::SP, RegState::Define)
9237
.addReg(AArch64::LR)
9238
.addReg(AArch64::SP)
9239
.addImm(-16);
9240
Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9241
.addReg(AArch64::SP, RegState::Define)
9242
.addReg(AArch64::LR, RegState::Define)
9243
.addReg(AArch64::SP)
9244
.addImm(16);
9245
}
9246
9247
It = MBB.insert(It, Save);
9248
It++;
9249
9250
// Insert the call.
9251
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9252
.addGlobalAddress(M.getNamedValue(MF.getName())));
9253
CallPt = It;
9254
It++;
9255
9256
It = MBB.insert(It, Restore);
9257
return CallPt;
9258
}
9259
9260
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9261
MachineFunction &MF) const {
9262
return MF.getFunction().hasMinSize();
9263
}
9264
9265
void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9266
MachineBasicBlock::iterator Iter,
9267
DebugLoc &DL,
9268
bool AllowSideEffects) const {
9269
const MachineFunction &MF = *MBB.getParent();
9270
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9271
const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9272
9273
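// The three branches below zero the register with, respectively, a MOVZ of
// #0 (general-purpose registers), an SVE DUP of #0 (when SVE is available),
// and a NEON MOVI of #0.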
if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9274
BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9275
} else if (STI.hasSVE()) {
9276
BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9277
.addImm(0)
9278
.addImm(0);
9279
} else {
9280
BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9281
.addImm(0);
9282
}
9283
}
9284
9285
std::optional<DestSourcePair>
9286
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9287
9288
// AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
9289
// and a zero immediate operand are used as an alias for the mov instruction.
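// E.g. "orr w0, wzr, w1" is "mov w0, w1" and "orr x0, xzr, x1" is
// "mov x0, x1".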
9290
if (MI.getOpcode() == AArch64::ORRWrs &&
9291
MI.getOperand(1).getReg() == AArch64::WZR &&
9292
MI.getOperand(3).getImm() == 0x0 &&
9293
// Check that the w->w move is not a zero-extending w->x mov.
9294
(!MI.getOperand(0).getReg().isVirtual() ||
9295
MI.getOperand(0).getSubReg() == 0) &&
9296
(!MI.getOperand(0).getReg().isPhysical() ||
9297
MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9298
AArch64::X0,
9299
/*TRI=*/nullptr) == -1))
9300
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9301
9302
if (MI.getOpcode() == AArch64::ORRXrs &&
9303
MI.getOperand(1).getReg() == AArch64::XZR &&
9304
MI.getOperand(3).getImm() == 0x0)
9305
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9306
9307
return std::nullopt;
9308
}
9309
9310
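// Treat a WZR-based ORRWrs as copy-like even when it also zero-extends into
// the full X register (the case isCopyInstrImpl above deliberately excludes):
// the low 32 bits are still a plain copy of the source.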
std::optional<DestSourcePair>
9311
AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9312
if (MI.getOpcode() == AArch64::ORRWrs &&
9313
MI.getOperand(1).getReg() == AArch64::WZR &&
9314
MI.getOperand(3).getImm() == 0x0)
9315
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9316
return std::nullopt;
9317
}
9318
9319
std::optional<RegImmPair>
9320
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9321
int Sign = 1;
9322
int64_t Offset = 0;
9323
9324
// TODO: Handle cases where Reg is a super- or sub-register of the
9325
// destination register.
9326
const MachineOperand &Op0 = MI.getOperand(0);
9327
if (!Op0.isReg() || Reg != Op0.getReg())
9328
return std::nullopt;
9329
9330
switch (MI.getOpcode()) {
9331
default:
9332
return std::nullopt;
9333
case AArch64::SUBWri:
9334
case AArch64::SUBXri:
9335
case AArch64::SUBSWri:
9336
case AArch64::SUBSXri:
9337
Sign *= -1;
9338
[[fallthrough]];
9339
case AArch64::ADDSWri:
9340
case AArch64::ADDSXri:
9341
case AArch64::ADDWri:
9342
case AArch64::ADDXri: {
9343
// TODO: Third operand can be global address (usually some string).
9344
if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9345
!MI.getOperand(2).isImm())
9346
return std::nullopt;
9347
int Shift = MI.getOperand(3).getImm();
9348
assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
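// E.g. "add x0, x1, #4, lsl #12" yields an offset of 16384; the SUB forms
// negate it via Sign.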
9349
Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9350
}
9351
}
9352
return RegImmPair{MI.getOperand(1).getReg(), Offset};
9353
}
9354
9355
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9356
/// the destination register then, if possible, describe the value in terms of
9357
/// the source register.
9358
static std::optional<ParamLoadedValue>
9359
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9360
const TargetInstrInfo *TII,
9361
const TargetRegisterInfo *TRI) {
9362
auto DestSrc = TII->isCopyLikeInstr(MI);
9363
if (!DestSrc)
9364
return std::nullopt;
9365
9366
Register DestReg = DestSrc->Destination->getReg();
9367
Register SrcReg = DestSrc->Source->getReg();
9368
9369
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9370
9371
// If the described register is the destination, just return the source.
9372
if (DestReg == DescribedReg)
9373
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9374
9375
// ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9376
if (MI.getOpcode() == AArch64::ORRWrs &&
9377
TRI->isSuperRegister(DestReg, DescribedReg))
9378
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9379
9380
// We may need to describe the lower part of a ORRXrs move.
9381
if (MI.getOpcode() == AArch64::ORRXrs &&
9382
TRI->isSubRegister(DestReg, DescribedReg)) {
9383
Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9384
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9385
}
9386
9387
assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9388
"Unhandled ORR[XW]rs copy case");
9389
9390
return std::nullopt;
9391
}
9392
9393
bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9394
// Functions cannot be split to different sections on AArch64 if they have
9395
// a red zone. This is because relaxing a cross-section branch may require
9396
// incrementing the stack pointer to spill a register, which would overwrite
9397
// the red zone.
9398
if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9399
return false;
9400
9401
return TargetInstrInfo::isFunctionSafeToSplit(MF);
9402
}
9403
9404
bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9405
const MachineBasicBlock &MBB) const {
9406
// Asm Goto blocks can contain conditional branches to goto labels, which can
9407
// get moved out of range of the branch instruction.
9408
auto isAsmGoto = [](const MachineInstr &MI) {
9409
return MI.getOpcode() == AArch64::INLINEASM_BR;
9410
};
9411
if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9412
return false;
9413
9414
// Because jump tables are label-relative instead of table-relative, they all
9415
// must be in the same section or relocation fixup handling will fail.
9416
9417
// Check if MBB is a jump table target
9418
const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9419
auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9420
return llvm::is_contained(JTE.MBBs, &MBB);
9421
};
9422
if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9423
return false;
9424
9425
// Check if MBB contains a jump table lookup
9426
for (const MachineInstr &MI : MBB) {
9427
switch (MI.getOpcode()) {
9428
case TargetOpcode::G_BRJT:
9429
case AArch64::JumpTableDest32:
9430
case AArch64::JumpTableDest16:
9431
case AArch64::JumpTableDest8:
9432
return false;
9433
default:
9434
continue;
9435
}
9436
}
9437
9438
// MBB isn't a special case, so it's safe to be split to the cold section.
9439
return true;
9440
}
9441
9442
std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
  return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}

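// Return true if a NumBytes-sized access can fold the given immediate Offset
// or register Scale into an AArch64 addressing mode: a 9-bit signed or scaled
// 12-bit unsigned immediate offset, or a register index that is either
// unscaled or scaled by the access size.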
bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

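// Return the opcode to use for an indirect call: BLRNoIP when SLS hardening
// of BLR calls is enabled, plain BLR otherwise.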
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

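// Emit a stack-probing loop at MBBI: repeatedly decrement SP by the function's
// stack probe size and store to [SP] while SP is still above TargetReg; on
// exit, set SP = TargetReg and probe the final location with a load. Returns
// an iterator to the first instruction after the emitted sequence.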
MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;

  // LoopTest:
  // SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  // CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(Flags);

  // B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::LE)
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  // STR XZR, [SP]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  // B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  // MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins.
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}

namespace {
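// Pipeliner loop info for single-basic-block loops whose trip count is
// controlled by a compare instruction (Comp) and a counter update instruction
// (Update). Used by the MachinePipeliner to build the trip-count guard
// conditions for the prologue/epilogue of the software-pipelined loop.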
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MachineRegisterInfo &MRI;

  /// The block of the loop
  MachineBasicBlock *LoopBB;
  /// The conditional branch of the loop
  MachineInstr *CondBranch;
  /// The compare instruction for loop control
  MachineInstr *Comp;
  /// The number of the operand of the loop counter value in Comp
  unsigned CompCounterOprNum;
  /// The instruction that updates the loop counter value
  MachineInstr *Update;
  /// The number of the operand of the loop counter value in Update
  unsigned UpdateCounterOprNum;
  /// The initial value of the loop counter
  Register Init;
  /// True iff Update is a predecessor of Comp
  bool IsUpdatePriorComp;

  /// The normalized condition used by createTripCountGreaterCondition()
  SmallVector<MachineOperand, 4> Cond;

public:
  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
                           MachineInstr *Comp, unsigned CompCounterOprNum,
                           MachineInstr *Update, unsigned UpdateCounterOprNum,
                           Register Init, bool IsUpdatePriorComp,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : MF(Comp->getParent()->getParent()),
        TII(MF->getSubtarget().getInstrInfo()),
        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
        CompCounterOprNum(CompCounterOprNum), Update(Update),
        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make the instructions for loop control be placed in stage 0.
    // The predecessors of Comp are considered by the caller.
    return MI == Comp;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been inserted.
    CondParam = Cond;
    return {};
  }

  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;

  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  void adjustTripCount(int TripCountAdjust) override {}

  void disposed() override {}
  bool isMVEExpanderSupported() override { return true; }
};
} // namespace

/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
/// is replaced by ReplaceReg. The output register is newly created.
/// The other operands are unchanged from MI.
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
                           Register ReplaceReg, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertTo) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      MBB.getParent()->getSubtarget().getRegisterInfo();
  MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
  Register Result = 0;
  for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
    if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
      Result = MRI.createVirtualRegister(
          MRI.getRegClass(NewMI->getOperand(0).getReg()));
      NewMI->getOperand(I).setReg(Result);
    } else if (I == ReplaceOprNum) {
      MRI.constrainRegClass(
          ReplaceReg,
          TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
      NewMI->getOperand(I).setReg(ReplaceReg);
    }
  }
  MBB.insert(InsertTo, NewMI);
  return Result;
}

void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
    int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
    DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
  // Create and accumulate conditions for next TC iterations.
  // Example:
  //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
  //                                          # iteration of the kernel
  //
  //   # insert the following instructions
  //   cond = CSINCXr 0, 0, C, implicit $nzcv
  //   counter = ADDXri counter, 1            # clone from this->Update
  //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
  //   cond = CSINCXr cond, cond, C, implicit $nzcv
  //   ... (repeat TC times)
  //   SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
  AArch64CC::CondCode CC =
      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
  if (CondBranch->getOperand(1).getMBB() == LoopBB)
    CC = AArch64CC::getInvertedCondCode(CC);

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
  auto AccumulateCond = [&](Register CurCond,
                            AArch64CC::CondCode CC) -> Register {
    Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
        .addReg(NewCond, RegState::Define)
        .addReg(CurCond)
        .addReg(CurCond)
        .addImm(AArch64CC::getInvertedCondCode(CC));
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I==0 already exist in MBB
    // (MBB is an unrolled kernel)
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // can use already calculated value
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // use initial counter value (testing if the trip count is sufficient to
      // be executed by pipelined code)
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
    }

    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      NextCounter =
          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
      .addReg(AccCond)
      .addImm(0)
      .addImm(0);
  Cond.clear();
  Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
}

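// Given a PHI with exactly two incoming values, set RegMBB to the value
// incoming from MBB and RegOther to the value incoming from the other
// predecessor.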
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
                          Register &RegMBB, Register &RegOther) {
  assert(Phi.getNumOperands() == 5);
  if (Phi.getOperand(2).getMBB() == MBB) {
    RegMBB = Phi.getOperand(1).getReg();
    RegOther = Phi.getOperand(3).getReg();
  } else {
    assert(Phi.getOperand(4).getMBB() == MBB);
    RegMBB = Phi.getOperand(3).getReg();
    RegOther = Phi.getOperand(1).getReg();
  }
}

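// Return true if Reg is a virtual register whose defining instruction lives
// outside of BB.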
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  return MRI.getVRegDef(Reg)->getParent() != BB;
}

/// If Reg is an induction variable, return true and set some parameters
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0 ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  Register CurReg = Reg;
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      if (InitReg != 0)
        return false;
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
    } else {
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        UpdateInst = Def;
        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
    }

    if (!CurReg.isVirtual())
      return false;
    if (Reg == CurReg)
      break;
  }

  if (!UpdateInst)
    return false;

  return true;
}

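// MachinePipeliner hook: inspect a single-basic-block loop and, if its branch,
// compare and induction-variable update match the supported forms listed
// below, return a PipelinerLoopInfo describing them.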
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  // Accept loops that meet the following conditions
  // * The conditional branch is BCC
  // * The compare instruction is ADDS/SUBS/WHILEXX
  // * One operand of the compare is an induction variable and the other is a
  //   loop invariant value
  // * The induction variable is incremented/decremented by a single instruction
  // * Does not contain CALL or instructions which have unmodeled side effects

  for (MachineInstr &MI : *LoopBB)
    if (MI.isCall() || MI.hasUnmodeledSideEffects())
      // This instruction may use NZCV, which interferes with the instruction to
      // be inserted for loop control.
      return nullptr;

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be conditional branch
  if (TBB != LoopBB && FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (CondBranch->getOpcode() != AArch64::Bcc)
    return nullptr;

  // Normalization for createTripCountGreaterCondition()
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *Comp = nullptr;
  unsigned CompCounterOprNum = 0;
  for (MachineInstr &MI : reverse(*LoopBB)) {
    if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
      // operands is a loop invariant value

      switch (MI.getOpcode()) {
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
        Comp = &MI;
        CompCounterOprNum = 1;
        break;
      case AArch64::ADDSWrr:
      case AArch64::ADDSXrr:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXrr:
        Comp = &MI;
        break;
      default:
        if (isWhileOpcode(MI.getOpcode())) {
          Comp = &MI;
          break;
        }
        return nullptr;
      }

      if (CompCounterOprNum == 0) {
        if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
          CompCounterOprNum = 2;
        else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
          CompCounterOprNum = 1;
        else
          return nullptr;
      }
      break;
    }
  }
  if (!Comp)
    return nullptr;

  MachineInstr *Update = nullptr;
  Register Init;
  bool IsUpdatePriorComp;
  unsigned UpdateCounterOprNum;
  if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
                     Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
    return nullptr;

  return std::make_unique<AArch64PipelinerLoopInfo>(
      LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
      Init, IsUpdatePriorComp, Cond);
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
