GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
1
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file contains the AArch64 implementation of the TargetInstrInfo class.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "AArch64InstrInfo.h"
14
#include "AArch64ExpandImm.h"
15
#include "AArch64FrameLowering.h"
16
#include "AArch64MachineFunctionInfo.h"
17
#include "AArch64PointerAuth.h"
18
#include "AArch64Subtarget.h"
19
#include "MCTargetDesc/AArch64AddressingModes.h"
20
#include "MCTargetDesc/AArch64MCTargetDesc.h"
21
#include "Utils/AArch64BaseInfo.h"
22
#include "llvm/ADT/ArrayRef.h"
23
#include "llvm/ADT/STLExtras.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/CodeGen/LivePhysRegs.h"
26
#include "llvm/CodeGen/MachineBasicBlock.h"
27
#include "llvm/CodeGen/MachineCombinerPattern.h"
28
#include "llvm/CodeGen/MachineFrameInfo.h"
29
#include "llvm/CodeGen/MachineFunction.h"
30
#include "llvm/CodeGen/MachineInstr.h"
31
#include "llvm/CodeGen/MachineInstrBuilder.h"
32
#include "llvm/CodeGen/MachineMemOperand.h"
33
#include "llvm/CodeGen/MachineModuleInfo.h"
34
#include "llvm/CodeGen/MachineOperand.h"
35
#include "llvm/CodeGen/MachineRegisterInfo.h"
36
#include "llvm/CodeGen/RegisterScavenging.h"
37
#include "llvm/CodeGen/StackMaps.h"
38
#include "llvm/CodeGen/TargetRegisterInfo.h"
39
#include "llvm/CodeGen/TargetSubtargetInfo.h"
40
#include "llvm/IR/DebugInfoMetadata.h"
41
#include "llvm/IR/DebugLoc.h"
42
#include "llvm/IR/GlobalValue.h"
43
#include "llvm/IR/Module.h"
44
#include "llvm/MC/MCAsmInfo.h"
45
#include "llvm/MC/MCInst.h"
46
#include "llvm/MC/MCInstBuilder.h"
47
#include "llvm/MC/MCInstrDesc.h"
48
#include "llvm/Support/Casting.h"
49
#include "llvm/Support/CodeGen.h"
50
#include "llvm/Support/CommandLine.h"
51
#include "llvm/Support/ErrorHandling.h"
52
#include "llvm/Support/LEB128.h"
53
#include "llvm/Support/MathExtras.h"
54
#include "llvm/Target/TargetMachine.h"
55
#include "llvm/Target/TargetOptions.h"
56
#include <cassert>
57
#include <cstdint>
58
#include <iterator>
59
#include <utility>
60
61
using namespace llvm;
62
63
#define GET_INSTRINFO_CTOR_DTOR
64
#include "AArch64GenInstrInfo.inc"
65
66
static cl::opt<unsigned> TBZDisplacementBits(
67
"aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
68
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69
70
static cl::opt<unsigned> CBZDisplacementBits(
71
"aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
72
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73
74
static cl::opt<unsigned>
75
BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
76
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77
78
static cl::opt<unsigned>
79
BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
80
cl::desc("Restrict range of B instructions (DEBUG)"));
81
82
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
83
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
84
AArch64::CATCHRET),
85
RI(STI.getTargetTriple()), Subtarget(STI) {}
86
87
/// GetInstSize - Return the number of bytes of code the specified
88
/// instruction may occupy. This returns the maximum number of bytes.
89
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
90
const MachineBasicBlock &MBB = *MI.getParent();
91
const MachineFunction *MF = MBB.getParent();
92
const Function &F = MF->getFunction();
93
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
94
95
{
96
auto Op = MI.getOpcode();
97
if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
98
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
99
}
100
101
// Meta-instructions emit no code.
102
if (MI.isMetaInstruction())
103
return 0;
104
105
// FIXME: We currently only handle pseudoinstructions that don't get expanded
106
// before the assembly printer.
107
unsigned NumBytes = 0;
108
const MCInstrDesc &Desc = MI.getDesc();
109
110
// Size should preferably be set in
111
// llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
112
// Specific cases handle instructions of variable sizes
113
switch (Desc.getOpcode()) {
114
default:
115
if (Desc.getSize())
116
return Desc.getSize();
117
118
// Anything not explicitly designated otherwise (i.e. pseudo-instructions
119
// with fixed constant size but not specified in .td file) is a normal
120
// 4-byte insn.
121
NumBytes = 4;
122
break;
123
case TargetOpcode::STACKMAP:
124
// The upper bound for a stackmap intrinsic is the full length of its shadow
125
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
126
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
127
break;
128
case TargetOpcode::PATCHPOINT:
129
// The size of the patchpoint intrinsic is the number of bytes requested
130
NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
131
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
132
break;
133
case TargetOpcode::STATEPOINT:
134
NumBytes = StatepointOpers(&MI).getNumPatchBytes();
135
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
136
// No patch bytes means a normal call inst is emitted
137
if (NumBytes == 0)
138
NumBytes = 4;
139
break;
140
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
141
// If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
142
// instructions are expanded to the specified number of NOPs. Otherwise,
143
// they are expanded to 36-byte XRay sleds.
144
NumBytes =
145
F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
146
break;
147
case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
148
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
149
// An XRay sled can be 4 bytes of alignment plus a 32-byte block.
150
NumBytes = 36;
151
break;
152
case TargetOpcode::PATCHABLE_EVENT_CALL:
153
// EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
154
NumBytes = 24;
155
break;
156
157
case AArch64::SPACE:
158
NumBytes = MI.getOperand(1).getImm();
159
break;
160
case TargetOpcode::BUNDLE:
161
NumBytes = getInstBundleLength(MI);
162
break;
163
}
164
165
return NumBytes;
166
}
167
168
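// Sum the sizes of all instructions inside the bundle headed by MI; the
// BUNDLE marker itself contributes no bytes.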
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
169
unsigned Size = 0;
170
MachineBasicBlock::const_instr_iterator I = MI.getIterator();
171
MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
172
while (++I != E && I->isInsideBundle()) {
173
assert(!I->isBundle() && "No nested bundle!");
174
Size += getInstSizeInBytes(*I);
175
}
176
return Size;
177
}
178
179
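// Decode a conditional branch terminator into its target block and the Cond
// encoding used by analyzeBranch: Bcc pushes its condition code, while
// CB(N)Z/TB(N)Z push a -1 marker, the opcode, the register and, for TB(N)Z,
// the bit number.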
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
180
SmallVectorImpl<MachineOperand> &Cond) {
181
// Block ends with fall-through condbranch.
182
switch (LastInst->getOpcode()) {
183
default:
184
llvm_unreachable("Unknown branch instruction?");
185
case AArch64::Bcc:
186
Target = LastInst->getOperand(1).getMBB();
187
Cond.push_back(LastInst->getOperand(0));
188
break;
189
case AArch64::CBZW:
190
case AArch64::CBZX:
191
case AArch64::CBNZW:
192
case AArch64::CBNZX:
193
Target = LastInst->getOperand(1).getMBB();
194
Cond.push_back(MachineOperand::CreateImm(-1));
195
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
196
Cond.push_back(LastInst->getOperand(0));
197
break;
198
case AArch64::TBZW:
199
case AArch64::TBZX:
200
case AArch64::TBNZW:
201
case AArch64::TBNZX:
202
Target = LastInst->getOperand(2).getMBB();
203
Cond.push_back(MachineOperand::CreateImm(-1));
204
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
205
Cond.push_back(LastInst->getOperand(0));
206
Cond.push_back(LastInst->getOperand(1));
207
}
208
}
209
210
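// Return the number of bits available for the signed displacement of the
// given branch opcode (tunable via the -aarch64-*-offset-bits options above).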
static unsigned getBranchDisplacementBits(unsigned Opc) {
211
switch (Opc) {
212
default:
213
llvm_unreachable("unexpected opcode!");
214
case AArch64::B:
215
return BDisplacementBits;
216
case AArch64::TBNZW:
217
case AArch64::TBZW:
218
case AArch64::TBNZX:
219
case AArch64::TBZX:
220
return TBZDisplacementBits;
221
case AArch64::CBNZW:
222
case AArch64::CBZW:
223
case AArch64::CBNZX:
224
case AArch64::CBZX:
225
return CBZDisplacementBits;
226
case AArch64::Bcc:
227
return BCCDisplacementBits;
228
}
229
}
230
231
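// Branch displacements are encoded in units of 4-byte instructions, so scale
// the byte offset before checking it against the opcode's displacement field.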
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
232
int64_t BrOffset) const {
233
unsigned Bits = getBranchDisplacementBits(BranchOp);
234
assert(Bits >= 3 && "max branch displacement must be enough to jump"
235
"over conditional branch expansion");
236
return isIntN(Bits, BrOffset / 4);
237
}
238
239
MachineBasicBlock *
240
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
241
switch (MI.getOpcode()) {
242
default:
243
llvm_unreachable("unexpected opcode!");
244
case AArch64::B:
245
return MI.getOperand(0).getMBB();
246
case AArch64::TBZW:
247
case AArch64::TBNZW:
248
case AArch64::TBZX:
249
case AArch64::TBNZX:
250
return MI.getOperand(2).getMBB();
251
case AArch64::CBZW:
252
case AArch64::CBNZW:
253
case AArch64::CBZX:
254
case AArch64::CBNZX:
255
case AArch64::Bcc:
256
return MI.getOperand(1).getMBB();
257
}
258
}
259
260
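// Expand an unconditional branch whose target may be out of range of a single
// B. Prefer a plain B and let the linker insert a range-extension thunk
// (which may clobber X16); otherwise build ADRP+ADD+BR through a scavenged
// register, or spill X16 around the thunk and reload it in RestoreBB.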
void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
261
MachineBasicBlock &NewDestBB,
262
MachineBasicBlock &RestoreBB,
263
const DebugLoc &DL,
264
int64_t BrOffset,
265
RegScavenger *RS) const {
266
assert(RS && "RegScavenger required for long branching");
267
assert(MBB.empty() &&
268
"new block should be inserted for expanding unconditional branch");
269
assert(MBB.pred_size() == 1);
270
assert(RestoreBB.empty() &&
271
"restore block should be inserted for restoring clobbered registers");
272
273
auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
274
// Offsets outside of the signed 33-bit range are not supported for ADRP +
275
// ADD.
276
if (!isInt<33>(BrOffset))
277
report_fatal_error(
278
"Branch offsets outside of the signed 33-bit range not supported");
279
280
BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
281
.addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
282
BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
283
.addReg(Reg)
284
.addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
285
.addImm(0);
286
BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
287
};
288
289
RS->enterBasicBlockEnd(MBB);
290
// If X16 is unused, we can rely on the linker to insert a range extension
291
// thunk if NewDestBB is out of range of a single B instruction.
292
constexpr Register Reg = AArch64::X16;
293
if (!RS->isRegUsed(Reg)) {
294
insertUnconditionalBranch(MBB, &NewDestBB, DL);
295
RS->setRegUsed(Reg);
296
return;
297
}
298
299
// If there's a free register and it's worth inflating the code size,
300
// manually insert the indirect branch.
301
Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
302
if (Scavenged != AArch64::NoRegister &&
303
MBB.getSectionID() == MBBSectionID::ColdSectionID) {
304
buildIndirectBranch(Scavenged, NewDestBB);
305
RS->setRegUsed(Scavenged);
306
return;
307
}
308
309
// Note: Spilling X16 briefly moves the stack pointer, making it incompatible
310
// with red zones.
311
AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
312
if (!AFI || AFI->hasRedZone().value_or(true))
313
report_fatal_error(
314
"Unable to insert indirect branch inside function that has red zone");
315
316
// Otherwise, spill X16 and defer range extension to the linker.
317
BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
318
.addReg(AArch64::SP, RegState::Define)
319
.addReg(Reg)
320
.addReg(AArch64::SP)
321
.addImm(-16);
322
323
BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
324
325
BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
326
.addReg(AArch64::SP, RegState::Define)
327
.addReg(Reg, RegState::Define)
328
.addReg(AArch64::SP)
329
.addImm(16);
330
}
331
332
// Branch analysis.
333
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
334
MachineBasicBlock *&TBB,
335
MachineBasicBlock *&FBB,
336
SmallVectorImpl<MachineOperand> &Cond,
337
bool AllowModify) const {
338
// If the block has no terminators, it just falls into the block after it.
339
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340
if (I == MBB.end())
341
return false;
342
343
// Skip over SpeculationBarrierEndBB terminators
344
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
345
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
346
--I;
347
}
348
349
if (!isUnpredicatedTerminator(*I))
350
return false;
351
352
// Get the last instruction in the block.
353
MachineInstr *LastInst = &*I;
354
355
// If there is only one terminator instruction, process it.
356
unsigned LastOpc = LastInst->getOpcode();
357
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
358
if (isUncondBranchOpcode(LastOpc)) {
359
TBB = LastInst->getOperand(0).getMBB();
360
return false;
361
}
362
if (isCondBranchOpcode(LastOpc)) {
363
// Block ends with fall-through condbranch.
364
parseCondBranch(LastInst, TBB, Cond);
365
return false;
366
}
367
return true; // Can't handle indirect branch.
368
}
369
370
// Get the instruction before it if it is a terminator.
371
MachineInstr *SecondLastInst = &*I;
372
unsigned SecondLastOpc = SecondLastInst->getOpcode();
373
374
// If AllowModify is true and the block ends with two or more unconditional
375
// branches, delete all but the first unconditional branch.
376
if (AllowModify && isUncondBranchOpcode(LastOpc)) {
377
while (isUncondBranchOpcode(SecondLastOpc)) {
378
LastInst->eraseFromParent();
379
LastInst = SecondLastInst;
380
LastOpc = LastInst->getOpcode();
381
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
382
// Return now the only terminator is an unconditional branch.
383
TBB = LastInst->getOperand(0).getMBB();
384
return false;
385
}
386
SecondLastInst = &*I;
387
SecondLastOpc = SecondLastInst->getOpcode();
388
}
389
}
390
391
// If we're allowed to modify and the block ends in an unconditional branch
392
// which could simply fall through, remove the branch. (Note: This case only
393
// matters when we can't understand the whole sequence, otherwise it's also
394
// handled by BranchFolding.cpp.)
395
if (AllowModify && isUncondBranchOpcode(LastOpc) &&
396
MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
397
LastInst->eraseFromParent();
398
LastInst = SecondLastInst;
399
LastOpc = LastInst->getOpcode();
400
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
401
assert(!isUncondBranchOpcode(LastOpc) &&
402
"unreachable unconditional branches removed above");
403
404
if (isCondBranchOpcode(LastOpc)) {
405
// Block ends with fall-through condbranch.
406
parseCondBranch(LastInst, TBB, Cond);
407
return false;
408
}
409
return true; // Can't handle indirect branch.
410
}
411
SecondLastInst = &*I;
412
SecondLastOpc = SecondLastInst->getOpcode();
413
}
414
415
// If there are three terminators, we don't know what sort of block this is.
416
if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
417
return true;
418
419
// If the block ends with a B and a Bcc, handle it.
420
if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
421
parseCondBranch(SecondLastInst, TBB, Cond);
422
FBB = LastInst->getOperand(0).getMBB();
423
return false;
424
}
425
426
// If the block ends with two unconditional branches, handle it. The second
427
// one is not executed, so remove it.
428
if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
429
TBB = SecondLastInst->getOperand(0).getMBB();
430
I = LastInst;
431
if (AllowModify)
432
I->eraseFromParent();
433
return false;
434
}
435
436
// ...likewise if it ends with an indirect branch followed by an unconditional
437
// branch.
438
if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
439
I = LastInst;
440
if (AllowModify)
441
I->eraseFromParent();
442
return true;
443
}
444
445
// Otherwise, can't handle this.
446
return true;
447
}
448
449
bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
450
MachineBranchPredicate &MBP,
451
bool AllowModify) const {
452
// For the moment, handle only a block which ends with a cb(n)zx followed by
453
// a fallthrough. Why this? Because it is a common form.
454
// TODO: Should we handle b.cc?
455
456
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
457
if (I == MBB.end())
458
return true;
459
460
// Skip over SpeculationBarrierEndBB terminators
461
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
462
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
463
--I;
464
}
465
466
if (!isUnpredicatedTerminator(*I))
467
return true;
468
469
// Get the last instruction in the block.
470
MachineInstr *LastInst = &*I;
471
unsigned LastOpc = LastInst->getOpcode();
472
if (!isCondBranchOpcode(LastOpc))
473
return true;
474
475
switch (LastOpc) {
476
default:
477
return true;
478
case AArch64::CBZW:
479
case AArch64::CBZX:
480
case AArch64::CBNZW:
481
case AArch64::CBNZX:
482
break;
483
};
484
485
MBP.TrueDest = LastInst->getOperand(1).getMBB();
486
assert(MBP.TrueDest && "expected!");
487
MBP.FalseDest = MBB.getNextNode();
488
489
MBP.ConditionDef = nullptr;
490
MBP.SingleUseCondition = false;
491
492
MBP.LHS = LastInst->getOperand(0);
493
MBP.RHS = MachineOperand::CreateImm(0);
494
MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
495
: MachineBranchPredicate::PRED_EQ;
496
return false;
497
}
498
499
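// Invert a condition produced by analyzeBranch: flip the Bcc condition code,
// or swap a folded compare-and-branch/test-and-branch opcode for its negated
// form.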
bool AArch64InstrInfo::reverseBranchCondition(
500
SmallVectorImpl<MachineOperand> &Cond) const {
501
if (Cond[0].getImm() != -1) {
502
// Regular Bcc
503
AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
504
Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
505
} else {
506
// Folded compare-and-branch
507
switch (Cond[1].getImm()) {
508
default:
509
llvm_unreachable("Unknown conditional branch!");
510
case AArch64::CBZW:
511
Cond[1].setImm(AArch64::CBNZW);
512
break;
513
case AArch64::CBNZW:
514
Cond[1].setImm(AArch64::CBZW);
515
break;
516
case AArch64::CBZX:
517
Cond[1].setImm(AArch64::CBNZX);
518
break;
519
case AArch64::CBNZX:
520
Cond[1].setImm(AArch64::CBZX);
521
break;
522
case AArch64::TBZW:
523
Cond[1].setImm(AArch64::TBNZW);
524
break;
525
case AArch64::TBNZW:
526
Cond[1].setImm(AArch64::TBZW);
527
break;
528
case AArch64::TBZX:
529
Cond[1].setImm(AArch64::TBNZX);
530
break;
531
case AArch64::TBNZX:
532
Cond[1].setImm(AArch64::TBZX);
533
break;
534
}
535
}
536
537
return false;
538
}
539
540
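// Erase the trailing unconditional and/or conditional branch from MBB,
// returning how many terminators were removed and, optionally, how many
// bytes that freed.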
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
541
int *BytesRemoved) const {
542
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
543
if (I == MBB.end())
544
return 0;
545
546
if (!isUncondBranchOpcode(I->getOpcode()) &&
547
!isCondBranchOpcode(I->getOpcode()))
548
return 0;
549
550
// Remove the branch.
551
I->eraseFromParent();
552
553
I = MBB.end();
554
555
if (I == MBB.begin()) {
556
if (BytesRemoved)
557
*BytesRemoved = 4;
558
return 1;
559
}
560
--I;
561
if (!isCondBranchOpcode(I->getOpcode())) {
562
if (BytesRemoved)
563
*BytesRemoved = 4;
564
return 1;
565
}
566
567
// Remove the branch.
568
I->eraseFromParent();
569
if (BytesRemoved)
570
*BytesRemoved = 8;
571
572
return 2;
573
}
574
575
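// Materialize the conditional branch described by a Cond array from
// analyzeBranch: either a Bcc or the original compare-/test-and-branch
// opcode.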
void AArch64InstrInfo::instantiateCondBranch(
576
MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
577
ArrayRef<MachineOperand> Cond) const {
578
if (Cond[0].getImm() != -1) {
579
// Regular Bcc
580
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
581
} else {
582
// Folded compare-and-branch
583
// Note that we use add() instead of addReg() so the operand's flags are kept.
584
const MachineInstrBuilder MIB =
585
BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
586
if (Cond.size() > 3)
587
MIB.addImm(Cond[3].getImm());
588
MIB.addMBB(TBB);
589
}
590
}
591
592
unsigned AArch64InstrInfo::insertBranch(
593
MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
594
ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
595
// Shouldn't be a fall through.
596
assert(TBB && "insertBranch must not be told to insert a fallthrough");
597
598
if (!FBB) {
599
if (Cond.empty()) // Unconditional branch?
600
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
601
else
602
instantiateCondBranch(MBB, DL, TBB, Cond);
603
604
if (BytesAdded)
605
*BytesAdded = 4;
606
607
return 1;
608
}
609
610
// Two-way conditional branch.
611
instantiateCondBranch(MBB, DL, TBB, Cond);
612
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
613
614
if (BytesAdded)
615
*BytesAdded = 8;
616
617
return 2;
618
}
619
620
// Find the original register that VReg is copied from.
621
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
622
while (Register::isVirtualRegister(VReg)) {
623
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
624
if (!DefMI->isFullCopy())
625
return VReg;
626
VReg = DefMI->getOperand(1).getReg();
627
}
628
return VReg;
629
}
630
631
// Determine if VReg is defined by an instruction that can be folded into a
632
// csel instruction. If so, return the folded opcode, and the replacement
633
// register.
634
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
635
unsigned *NewVReg = nullptr) {
636
VReg = removeCopies(MRI, VReg);
637
if (!Register::isVirtualRegister(VReg))
638
return 0;
639
640
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
641
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
642
unsigned Opc = 0;
643
unsigned SrcOpNum = 0;
644
switch (DefMI->getOpcode()) {
645
case AArch64::ADDSXri:
646
case AArch64::ADDSWri:
647
// if NZCV is used, do not fold.
648
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
649
true) == -1)
650
return 0;
651
// fall-through to ADDXri and ADDWri.
652
[[fallthrough]];
653
case AArch64::ADDXri:
654
case AArch64::ADDWri:
655
// add x, 1 -> csinc.
656
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
657
DefMI->getOperand(3).getImm() != 0)
658
return 0;
659
SrcOpNum = 1;
660
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
661
break;
662
663
case AArch64::ORNXrr:
664
case AArch64::ORNWrr: {
665
// not x -> csinv, represented as orn dst, xzr, src.
666
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
667
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
668
return 0;
669
SrcOpNum = 2;
670
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
671
break;
672
}
673
674
case AArch64::SUBSXrr:
675
case AArch64::SUBSWrr:
676
// if NZCV is used, do not fold.
677
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
678
true) == -1)
679
return 0;
680
// fall-through to SUBXrr and SUBWrr.
681
[[fallthrough]];
682
case AArch64::SUBXrr:
683
case AArch64::SUBWrr: {
684
// neg x -> csneg, represented as sub dst, xzr, src.
685
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
686
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
687
return 0;
688
SrcOpNum = 2;
689
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
690
break;
691
}
692
default:
693
return 0;
694
}
695
assert(Opc && SrcOpNum && "Missing parameters");
696
697
if (NewVReg)
698
*NewVReg = DefMI->getOperand(SrcOpNum).getReg();
699
return Opc;
700
}
701
702
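// Report whether a select on this condition and these register classes can
// be lowered to csel/fcsel, along with rough latency estimates for the
// condition and the two operands.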
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
703
ArrayRef<MachineOperand> Cond,
704
Register DstReg, Register TrueReg,
705
Register FalseReg, int &CondCycles,
706
int &TrueCycles,
707
int &FalseCycles) const {
708
// Check register classes.
709
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
710
const TargetRegisterClass *RC =
711
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
712
if (!RC)
713
return false;
714
715
// Also need to check the dest regclass, in case we're trying to optimize
716
// something like:
717
// %1(gpr) = PHI %2(fpr), bb1, %3(fpr), bb2
718
if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
719
return false;
720
721
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
722
unsigned ExtraCondLat = Cond.size() != 1;
723
724
// GPRs are handled by csel.
725
// FIXME: Fold in x+1, -x, and ~x when applicable.
726
if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
727
AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
728
// Single-cycle csel, csinc, csinv, and csneg.
729
CondCycles = 1 + ExtraCondLat;
730
TrueCycles = FalseCycles = 1;
731
if (canFoldIntoCSel(MRI, TrueReg))
732
TrueCycles = 0;
733
else if (canFoldIntoCSel(MRI, FalseReg))
734
FalseCycles = 0;
735
return true;
736
}
737
738
// Scalar floating point is handled by fcsel.
739
// FIXME: Form fabs, fmin, and fmax when applicable.
740
if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
741
AArch64::FPR32RegClass.hasSubClassEq(RC)) {
742
CondCycles = 5 + ExtraCondLat;
743
TrueCycles = FalseCycles = 2;
744
return true;
745
}
746
747
// Can't do vectors.
748
return false;
749
}
750
751
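// Emit a compare (when the condition came from cbz/cbnz or tbz/tbnz) followed
// by a csel/fcsel, or a folded csinc/csinv/csneg, selecting TrueReg or
// FalseReg into DstReg.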
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
752
MachineBasicBlock::iterator I,
753
const DebugLoc &DL, Register DstReg,
754
ArrayRef<MachineOperand> Cond,
755
Register TrueReg, Register FalseReg) const {
756
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
757
758
// Parse the condition code, see parseCondBranch() above.
759
AArch64CC::CondCode CC;
760
switch (Cond.size()) {
761
default:
762
llvm_unreachable("Unknown condition opcode in Cond");
763
case 1: // b.cc
764
CC = AArch64CC::CondCode(Cond[0].getImm());
765
break;
766
case 3: { // cbz/cbnz
767
// We must insert a compare against 0.
768
bool Is64Bit;
769
switch (Cond[1].getImm()) {
770
default:
771
llvm_unreachable("Unknown branch opcode in Cond");
772
case AArch64::CBZW:
773
Is64Bit = false;
774
CC = AArch64CC::EQ;
775
break;
776
case AArch64::CBZX:
777
Is64Bit = true;
778
CC = AArch64CC::EQ;
779
break;
780
case AArch64::CBNZW:
781
Is64Bit = false;
782
CC = AArch64CC::NE;
783
break;
784
case AArch64::CBNZX:
785
Is64Bit = true;
786
CC = AArch64CC::NE;
787
break;
788
}
789
Register SrcReg = Cond[2].getReg();
790
if (Is64Bit) {
791
// cmp reg, #0 is actually subs xzr, reg, #0.
792
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
793
BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
794
.addReg(SrcReg)
795
.addImm(0)
796
.addImm(0);
797
} else {
798
MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
799
BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
800
.addReg(SrcReg)
801
.addImm(0)
802
.addImm(0);
803
}
804
break;
805
}
806
case 4: { // tbz/tbnz
807
// We must insert a tst instruction.
808
switch (Cond[1].getImm()) {
809
default:
810
llvm_unreachable("Unknown branch opcode in Cond");
811
case AArch64::TBZW:
812
case AArch64::TBZX:
813
CC = AArch64CC::EQ;
814
break;
815
case AArch64::TBNZW:
816
case AArch64::TBNZX:
817
CC = AArch64CC::NE;
818
break;
819
}
820
// tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
821
if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
822
BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
823
.addReg(Cond[2].getReg())
824
.addImm(
825
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
826
else
827
BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
828
.addReg(Cond[2].getReg())
829
.addImm(
830
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
831
break;
832
}
833
}
834
835
unsigned Opc = 0;
836
const TargetRegisterClass *RC = nullptr;
837
bool TryFold = false;
838
if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
839
RC = &AArch64::GPR64RegClass;
840
Opc = AArch64::CSELXr;
841
TryFold = true;
842
} else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
843
RC = &AArch64::GPR32RegClass;
844
Opc = AArch64::CSELWr;
845
TryFold = true;
846
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
847
RC = &AArch64::FPR64RegClass;
848
Opc = AArch64::FCSELDrrr;
849
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
850
RC = &AArch64::FPR32RegClass;
851
Opc = AArch64::FCSELSrrr;
852
}
853
assert(RC && "Unsupported regclass");
854
855
// Try folding simple instructions into the csel.
856
if (TryFold) {
857
unsigned NewVReg = 0;
858
unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
859
if (FoldedOpc) {
860
// The folded opcodes csinc, csinv and csneg apply the operation to
861
// FalseReg, so we need to invert the condition.
862
CC = AArch64CC::getInvertedCondCode(CC);
863
TrueReg = FalseReg;
864
} else
865
FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
866
867
// Fold the operation. Leave any dead instructions for DCE to clean up.
868
if (FoldedOpc) {
869
FalseReg = NewVReg;
870
Opc = FoldedOpc;
871
// This extends the live range of NewVReg.
872
MRI.clearKillFlags(NewVReg);
873
}
874
}
875
876
// Pull all virtual registers into the appropriate class.
877
MRI.constrainRegClass(TrueReg, RC);
878
MRI.constrainRegClass(FalseReg, RC);
879
880
// Insert the csel.
881
BuildMI(MBB, I, DL, get(Opc), DstReg)
882
.addReg(TrueReg)
883
.addReg(FalseReg)
884
.addImm(CC);
885
}
886
887
// Return true if Imm can be loaded into a register by a "cheap" sequence of
888
// instructions. For now, "cheap" means at most two instructions.
889
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
890
if (BitSize == 32)
891
return true;
892
893
assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
894
uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
895
SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
896
AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
897
898
return Is.size() <= 2;
899
}
900
901
// FIXME: this implementation should be micro-architecture dependent, so a
902
// micro-architecture target hook should be introduced here in future.
903
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904
if (Subtarget.hasExynosCheapAsMoveHandling()) {
905
if (isExynosCheapAsMove(MI))
906
return true;
907
return MI.isAsCheapAsAMove();
908
}
909
910
switch (MI.getOpcode()) {
911
default:
912
return MI.isAsCheapAsAMove();
913
914
case AArch64::ADDWrs:
915
case AArch64::ADDXrs:
916
case AArch64::SUBWrs:
917
case AArch64::SUBXrs:
918
return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
919
920
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
921
// ORRXri, it is as cheap as MOV.
922
// Likewise if it can be expanded to MOVZ/MOVN/MOVK.
923
case AArch64::MOVi32imm:
924
return isCheapImmediate(MI, 32);
925
case AArch64::MOVi64imm:
926
return isCheapImmediate(MI, 64);
927
}
928
}
929
930
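// Return true when MI's shifted or extended operand form is cheap on Falkor,
// i.e. the shift amount or extension kind carries no extra latency.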
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
931
switch (MI.getOpcode()) {
932
default:
933
return false;
934
935
case AArch64::ADDWrs:
936
case AArch64::ADDXrs:
937
case AArch64::ADDSWrs:
938
case AArch64::ADDSXrs: {
939
unsigned Imm = MI.getOperand(3).getImm();
940
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
941
if (ShiftVal == 0)
942
return true;
943
return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
944
}
945
946
case AArch64::ADDWrx:
947
case AArch64::ADDXrx:
948
case AArch64::ADDXrx64:
949
case AArch64::ADDSWrx:
950
case AArch64::ADDSXrx:
951
case AArch64::ADDSXrx64: {
952
unsigned Imm = MI.getOperand(3).getImm();
953
switch (AArch64_AM::getArithExtendType(Imm)) {
954
default:
955
return false;
956
case AArch64_AM::UXTB:
957
case AArch64_AM::UXTH:
958
case AArch64_AM::UXTW:
959
case AArch64_AM::UXTX:
960
return AArch64_AM::getArithShiftValue(Imm) <= 4;
961
}
962
}
963
964
case AArch64::SUBWrs:
965
case AArch64::SUBSWrs: {
966
unsigned Imm = MI.getOperand(3).getImm();
967
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
968
return ShiftVal == 0 ||
969
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
970
}
971
972
case AArch64::SUBXrs:
973
case AArch64::SUBSXrs: {
974
unsigned Imm = MI.getOperand(3).getImm();
975
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
976
return ShiftVal == 0 ||
977
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
978
}
979
980
case AArch64::SUBWrx:
981
case AArch64::SUBXrx:
982
case AArch64::SUBXrx64:
983
case AArch64::SUBSWrx:
984
case AArch64::SUBSXrx:
985
case AArch64::SUBSXrx64: {
986
unsigned Imm = MI.getOperand(3).getImm();
987
switch (AArch64_AM::getArithExtendType(Imm)) {
988
default:
989
return false;
990
case AArch64_AM::UXTB:
991
case AArch64_AM::UXTH:
992
case AArch64_AM::UXTW:
993
case AArch64_AM::UXTX:
994
return AArch64_AM::getArithShiftValue(Imm) == 0;
995
}
996
}
997
998
case AArch64::LDRBBroW:
999
case AArch64::LDRBBroX:
1000
case AArch64::LDRBroW:
1001
case AArch64::LDRBroX:
1002
case AArch64::LDRDroW:
1003
case AArch64::LDRDroX:
1004
case AArch64::LDRHHroW:
1005
case AArch64::LDRHHroX:
1006
case AArch64::LDRHroW:
1007
case AArch64::LDRHroX:
1008
case AArch64::LDRQroW:
1009
case AArch64::LDRQroX:
1010
case AArch64::LDRSBWroW:
1011
case AArch64::LDRSBWroX:
1012
case AArch64::LDRSBXroW:
1013
case AArch64::LDRSBXroX:
1014
case AArch64::LDRSHWroW:
1015
case AArch64::LDRSHWroX:
1016
case AArch64::LDRSHXroW:
1017
case AArch64::LDRSHXroX:
1018
case AArch64::LDRSWroW:
1019
case AArch64::LDRSWroX:
1020
case AArch64::LDRSroW:
1021
case AArch64::LDRSroX:
1022
case AArch64::LDRWroW:
1023
case AArch64::LDRWroX:
1024
case AArch64::LDRXroW:
1025
case AArch64::LDRXroX:
1026
case AArch64::PRFMroW:
1027
case AArch64::PRFMroX:
1028
case AArch64::STRBBroW:
1029
case AArch64::STRBBroX:
1030
case AArch64::STRBroW:
1031
case AArch64::STRBroX:
1032
case AArch64::STRDroW:
1033
case AArch64::STRDroX:
1034
case AArch64::STRHHroW:
1035
case AArch64::STRHHroX:
1036
case AArch64::STRHroW:
1037
case AArch64::STRHroX:
1038
case AArch64::STRQroW:
1039
case AArch64::STRQroX:
1040
case AArch64::STRSroW:
1041
case AArch64::STRSroX:
1042
case AArch64::STRWroW:
1043
case AArch64::STRWroX:
1044
case AArch64::STRXroW:
1045
case AArch64::STRXroX: {
1046
unsigned IsSigned = MI.getOperand(3).getImm();
1047
return !IsSigned;
1048
}
1049
}
1050
}
1051
1052
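// Return true for the pseudo instructions that represent Windows SEH unwind
// opcodes.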
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1053
unsigned Opc = MI.getOpcode();
1054
switch (Opc) {
1055
default:
1056
return false;
1057
case AArch64::SEH_StackAlloc:
1058
case AArch64::SEH_SaveFPLR:
1059
case AArch64::SEH_SaveFPLR_X:
1060
case AArch64::SEH_SaveReg:
1061
case AArch64::SEH_SaveReg_X:
1062
case AArch64::SEH_SaveRegP:
1063
case AArch64::SEH_SaveRegP_X:
1064
case AArch64::SEH_SaveFReg:
1065
case AArch64::SEH_SaveFReg_X:
1066
case AArch64::SEH_SaveFRegP:
1067
case AArch64::SEH_SaveFRegP_X:
1068
case AArch64::SEH_SetFP:
1069
case AArch64::SEH_AddFP:
1070
case AArch64::SEH_Nop:
1071
case AArch64::SEH_PrologEnd:
1072
case AArch64::SEH_EpilogStart:
1073
case AArch64::SEH_EpilogEnd:
1074
case AArch64::SEH_PACSignLR:
1075
case AArch64::SEH_SaveAnyRegQP:
1076
case AArch64::SEH_SaveAnyRegQPX:
1077
return true;
1078
}
1079
}
1080
1081
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1082
Register &SrcReg, Register &DstReg,
1083
unsigned &SubIdx) const {
1084
switch (MI.getOpcode()) {
1085
default:
1086
return false;
1087
case AArch64::SBFMXri: // aka sxtw
1088
case AArch64::UBFMXri: // aka uxtw
1089
// Check for the 32 -> 64 bit extension case, these instructions can do
1090
// much more.
1091
if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1092
return false;
1093
// This is a signed or unsigned 32 -> 64 bit extension.
1094
SrcReg = MI.getOperand(1).getReg();
1095
DstReg = MI.getOperand(0).getReg();
1096
SubIdx = AArch64::sub_32;
1097
return true;
1098
}
1099
}
1100
1101
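// Conservatively prove that two memory accesses cannot overlap by comparing
// their base operands, offsets and access widths.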
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1102
const MachineInstr &MIa, const MachineInstr &MIb) const {
1103
const TargetRegisterInfo *TRI = &getRegisterInfo();
1104
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1105
int64_t OffsetA = 0, OffsetB = 0;
1106
TypeSize WidthA(0, false), WidthB(0, false);
1107
bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1108
1109
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1110
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1111
1112
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1113
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1114
return false;
1115
1116
// Retrieve the base, offset from the base and width. Width
1117
// is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1118
// bases are identical, and the offset of the lower memory access +
1119
// the width doesn't overlap the offset of a higher memory access,
1120
// then the memory accesses are different.
1121
// If OffsetAIsScalable and OffsetBIsScalable are both true, they
1122
// are assumed to have the same scale (vscale).
1123
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1124
WidthA, TRI) &&
1125
getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1126
WidthB, TRI)) {
1127
if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1128
OffsetAIsScalable == OffsetBIsScalable) {
1129
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1130
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1131
TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1132
if (LowWidth.isScalable() == OffsetAIsScalable &&
1133
LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1134
return true;
1135
}
1136
}
1137
return false;
1138
}
1139
1140
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1141
const MachineBasicBlock *MBB,
1142
const MachineFunction &MF) const {
1143
if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1144
return true;
1145
1146
// Do not move an instruction that can be recognized as a branch target.
1147
if (hasBTISemantics(MI))
1148
return true;
1149
1150
switch (MI.getOpcode()) {
1151
case AArch64::HINT:
1152
// CSDB hints are scheduling barriers.
1153
if (MI.getOperand(0).getImm() == 0x14)
1154
return true;
1155
break;
1156
case AArch64::DSB:
1157
case AArch64::ISB:
1158
// DSB and ISB also are scheduling barriers.
1159
return true;
1160
case AArch64::MSRpstatesvcrImm1:
1161
// SMSTART and SMSTOP are also scheduling barriers.
1162
return true;
1163
default:;
1164
}
1165
if (isSEHInstruction(MI))
1166
return true;
1167
auto Next = std::next(MI.getIterator());
1168
return Next != MBB->end() && Next->isCFIInstruction();
1169
}
1170
1171
/// analyzeCompare - For a comparison instruction, return the source registers
1172
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1173
/// Return true if the comparison instruction can be analyzed.
1174
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1175
Register &SrcReg2, int64_t &CmpMask,
1176
int64_t &CmpValue) const {
1177
// The first operand can be a frame index where we'd normally expect a
1178
// register.
1179
assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1180
if (!MI.getOperand(1).isReg())
1181
return false;
1182
1183
switch (MI.getOpcode()) {
1184
default:
1185
break;
1186
case AArch64::PTEST_PP:
1187
case AArch64::PTEST_PP_ANY:
1188
SrcReg = MI.getOperand(0).getReg();
1189
SrcReg2 = MI.getOperand(1).getReg();
1190
// Not sure about the mask and value for now...
1191
CmpMask = ~0;
1192
CmpValue = 0;
1193
return true;
1194
case AArch64::SUBSWrr:
1195
case AArch64::SUBSWrs:
1196
case AArch64::SUBSWrx:
1197
case AArch64::SUBSXrr:
1198
case AArch64::SUBSXrs:
1199
case AArch64::SUBSXrx:
1200
case AArch64::ADDSWrr:
1201
case AArch64::ADDSWrs:
1202
case AArch64::ADDSWrx:
1203
case AArch64::ADDSXrr:
1204
case AArch64::ADDSXrs:
1205
case AArch64::ADDSXrx:
1206
// Replace SUBSWrr with SUBWrr if NZCV is not used.
1207
SrcReg = MI.getOperand(1).getReg();
1208
SrcReg2 = MI.getOperand(2).getReg();
1209
CmpMask = ~0;
1210
CmpValue = 0;
1211
return true;
1212
case AArch64::SUBSWri:
1213
case AArch64::ADDSWri:
1214
case AArch64::SUBSXri:
1215
case AArch64::ADDSXri:
1216
SrcReg = MI.getOperand(1).getReg();
1217
SrcReg2 = 0;
1218
CmpMask = ~0;
1219
CmpValue = MI.getOperand(2).getImm();
1220
return true;
1221
case AArch64::ANDSWri:
1222
case AArch64::ANDSXri:
1223
// ANDS does not use the same encoding scheme as the other xxxS
1224
// instructions.
1225
SrcReg = MI.getOperand(1).getReg();
1226
SrcReg2 = 0;
1227
CmpMask = ~0;
1228
CmpValue = AArch64_AM::decodeLogicalImmediate(
1229
MI.getOperand(2).getImm(),
1230
MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1231
return true;
1232
}
1233
1234
return false;
1235
}
1236
1237
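// Re-constrain every register operand of Instr to the class required by its
// (possibly just replaced) instruction descriptor; returns false if any
// operand cannot be constrained.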
static bool UpdateOperandRegClass(MachineInstr &Instr) {
1238
MachineBasicBlock *MBB = Instr.getParent();
1239
assert(MBB && "Can't get MachineBasicBlock here");
1240
MachineFunction *MF = MBB->getParent();
1241
assert(MF && "Can't get MachineFunction here");
1242
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1243
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1244
MachineRegisterInfo *MRI = &MF->getRegInfo();
1245
1246
for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1247
++OpIdx) {
1248
MachineOperand &MO = Instr.getOperand(OpIdx);
1249
const TargetRegisterClass *OpRegCstraints =
1250
Instr.getRegClassConstraint(OpIdx, TII, TRI);
1251
1252
// If there's no constraint, there's nothing to do.
1253
if (!OpRegCstraints)
1254
continue;
1255
// If the operand is a frame index, there's nothing to do here.
1256
// A frame index operand will resolve correctly during PEI.
1257
if (MO.isFI())
1258
continue;
1259
1260
assert(MO.isReg() &&
1261
"Operand has register constraints without being a register!");
1262
1263
Register Reg = MO.getReg();
1264
if (Reg.isPhysical()) {
1265
if (!OpRegCstraints->contains(Reg))
1266
return false;
1267
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1268
!MRI->constrainRegClass(Reg, OpRegCstraints))
1269
return false;
1270
}
1271
1272
return true;
1273
}
1274
1275
/// Return the opcode that does not set flags when possible - otherwise
1276
/// return the original opcode. The caller is responsible to do the actual
1277
/// substitution and legality checking.
1278
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1279
// Don't convert all compare instructions, because for some the zero register
1280
// encoding becomes the sp register.
1281
bool MIDefinesZeroReg = false;
1282
if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1283
MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1284
MIDefinesZeroReg = true;
1285
1286
switch (MI.getOpcode()) {
1287
default:
1288
return MI.getOpcode();
1289
case AArch64::ADDSWrr:
1290
return AArch64::ADDWrr;
1291
case AArch64::ADDSWri:
1292
return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1293
case AArch64::ADDSWrs:
1294
return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1295
case AArch64::ADDSWrx:
1296
return AArch64::ADDWrx;
1297
case AArch64::ADDSXrr:
1298
return AArch64::ADDXrr;
1299
case AArch64::ADDSXri:
1300
return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1301
case AArch64::ADDSXrs:
1302
return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1303
case AArch64::ADDSXrx:
1304
return AArch64::ADDXrx;
1305
case AArch64::SUBSWrr:
1306
return AArch64::SUBWrr;
1307
case AArch64::SUBSWri:
1308
return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1309
case AArch64::SUBSWrs:
1310
return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1311
case AArch64::SUBSWrx:
1312
return AArch64::SUBWrx;
1313
case AArch64::SUBSXrr:
1314
return AArch64::SUBXrr;
1315
case AArch64::SUBSXri:
1316
return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1317
case AArch64::SUBSXrs:
1318
return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1319
case AArch64::SUBSXrx:
1320
return AArch64::SUBXrx;
1321
}
1322
}
1323
1324
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1325
1326
/// True when condition flags are accessed (either by writing or reading)
1327
/// on the instruction trace starting at From and ending at To.
1328
///
1329
/// Note: If From and To are in different blocks, it's assumed the flags are accessed
1330
/// on the path.
1331
static bool areCFlagsAccessedBetweenInstrs(
1332
MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1333
const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1334
// Early exit if To is at the beginning of the BB.
1335
if (To == To->getParent()->begin())
1336
return true;
1337
1338
// Check whether the instructions are in the same basic block
1339
// If not, assume the condition flags might get modified somewhere.
1340
if (To->getParent() != From->getParent())
1341
return true;
1342
1343
// From must be above To.
1344
assert(std::any_of(
1345
++To.getReverse(), To->getParent()->rend(),
1346
[From](MachineInstr &MI) { return MI.getIterator() == From; }));
1347
1348
// We iterate backward starting at \p To until we hit \p From.
1349
for (const MachineInstr &Instr :
1350
instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1351
if (((AccessToCheck & AK_Write) &&
1352
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1353
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1354
return true;
1355
}
1356
return false;
1357
}
1358
1359
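/// Determine whether PTEST(Mask, Pred) is redundant because Pred already sets
/// NZCV as the PTEST would. Returns the opcode Pred should use instead
/// (possibly its flag-setting variant), or std::nullopt if the PTEST is
/// needed.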
std::optional<unsigned>
1360
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1361
MachineInstr *Pred,
1362
const MachineRegisterInfo *MRI) const {
1363
unsigned MaskOpcode = Mask->getOpcode();
1364
unsigned PredOpcode = Pred->getOpcode();
1365
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1366
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1367
1368
if (PredIsWhileLike) {
1369
// For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1370
// instruction and the condition is "any" since WHILEcc does an implicit
1371
// PTEST(ALL, PG) check and PG is always a subset of ALL.
1372
if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1373
return PredOpcode;
1374
1375
// For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1376
// redundant since WHILE performs an implicit PTEST with an all active
1377
// mask.
1378
if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1379
getElementSizeForOpcode(MaskOpcode) ==
1380
getElementSizeForOpcode(PredOpcode))
1381
return PredOpcode;
1382
1383
return {};
1384
}
1385
1386
if (PredIsPTestLike) {
1387
// For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1388
// instruction that sets the flags as PTEST would and the condition is
1389
// "any" since PG is always a subset of the governing predicate of the
1390
// ptest-like instruction.
1391
if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1392
return PredOpcode;
1393
1394
// For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if
1395
// the element size matches and either the PTEST_LIKE instruction uses
1396
// the same all active mask or the condition is "any".
1397
if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1398
getElementSizeForOpcode(MaskOpcode) ==
1399
getElementSizeForOpcode(PredOpcode)) {
1400
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1401
if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1402
return PredOpcode;
1403
}
1404
1405
// For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1406
// flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1407
// on 8-bit predicates like the PTEST. Otherwise, for instructions like
1408
// compare that also support 16/32/64-bit predicates, the implicit PTEST
1409
// performed by the compare could consider fewer lanes for these element
1410
// sizes.
1411
//
1412
// For example, consider
1413
//
1414
// ptrue p0.b ; P0=1111-1111-1111-1111
1415
// index z0.s, #0, #1 ; Z0=<0,1,2,3>
1416
// index z1.s, #1, #1 ; Z1=<1,2,3,4>
1417
// cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1418
// ; ^ last active
1419
// ptest p0, p1.b ; P1=0001-0001-0001-0001
1420
// ; ^ last active
1421
//
1422
// where the compare generates a canonical all active 32-bit predicate
1423
// (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1424
// active flag, whereas the PTEST instruction with the same mask doesn't.
1425
// For PTEST_ANY this doesn't apply as the flags in this case would be
1426
// identical regardless of element size.
1427
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1428
uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1429
if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1430
PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1431
return PredOpcode;
1432
1433
return {};
1434
}
1435
1436
// If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1437
// opcode so the PTEST becomes redundant.
1438
switch (PredOpcode) {
1439
case AArch64::AND_PPzPP:
1440
case AArch64::BIC_PPzPP:
1441
case AArch64::EOR_PPzPP:
1442
case AArch64::NAND_PPzPP:
1443
case AArch64::NOR_PPzPP:
1444
case AArch64::ORN_PPzPP:
1445
case AArch64::ORR_PPzPP:
1446
case AArch64::BRKA_PPzP:
1447
case AArch64::BRKPA_PPzPP:
1448
case AArch64::BRKB_PPzP:
1449
case AArch64::BRKPB_PPzPP:
1450
case AArch64::RDFFR_PPz: {
1451
// Check to see if our mask is the same. If not, the resulting flag bits
1452
// may be different and we can't remove the ptest.
1453
auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1454
if (Mask != PredMask)
1455
return {};
1456
break;
1457
}
1458
case AArch64::BRKN_PPzP: {
1459
// BRKN uses an all active implicit mask to set flags unlike the other
1460
// flag-setting instructions.
1461
// PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1462
if ((MaskOpcode != AArch64::PTRUE_B) ||
1463
(Mask->getOperand(1).getImm() != 31))
1464
return {};
1465
break;
1466
}
1467
case AArch64::PTRUE_B:
1468
// PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1469
break;
1470
default:
1471
// Bail out if we don't recognize the input
1472
return {};
1473
}
1474
1475
return convertToFlagSettingOpc(PredOpcode);
1476
}
1477
1478
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1479
/// operation which could set the flags in an identical manner
1480
bool AArch64InstrInfo::optimizePTestInstr(
1481
MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1482
const MachineRegisterInfo *MRI) const {
1483
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1484
auto *Pred = MRI->getUniqueVRegDef(PredReg);
1485
unsigned PredOpcode = Pred->getOpcode();
1486
auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1487
if (!NewOp)
1488
return false;
1489
1490
const TargetRegisterInfo *TRI = &getRegisterInfo();
1491
1492
// If another instruction between Pred and PTest accesses flags, don't remove
1493
// the ptest or update the earlier instruction to modify them.
1494
if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1495
return false;
1496
1497
// If we pass all the checks, it's safe to remove the PTEST and use the flags
1498
// as they are prior to PTEST. Sometimes this requires the tested PTEST
1499
// operand to be replaced with an equivalent instruction that also sets the
1500
// flags.
1501
PTest->eraseFromParent();
1502
if (*NewOp != PredOpcode) {
1503
Pred->setDesc(get(*NewOp));
1504
bool succeeded = UpdateOperandRegClass(*Pred);
1505
(void)succeeded;
1506
assert(succeeded && "Operands have incompatible register classes!");
1507
Pred->addRegisterDefined(AArch64::NZCV, TRI);
1508
}
1509
1510
// Ensure that the flags def is live.
1511
if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1512
unsigned i = 0, e = Pred->getNumOperands();
1513
for (; i != e; ++i) {
1514
MachineOperand &MO = Pred->getOperand(i);
1515
if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1516
MO.setIsDead(false);
1517
break;
1518
}
1519
}
1520
}
1521
return true;
1522
}
1523
1524
/// Try to optimize a compare instruction. A compare instruction is an
1525
/// instruction which produces AArch64::NZCV. It is a true compare
1526
/// instruction
1527
/// when there are no uses of its destination register.
1528
///
1529
/// The following steps are tried in order:
1530
/// 1. Convert CmpInstr into an unconditional version.
1531
/// 2. Remove CmpInstr if there is an earlier instruction producing a needed
1532
/// condition code or an instruction which can be converted into such an
1533
/// instruction.
1534
/// Only comparison with zero is supported.
1535
bool AArch64InstrInfo::optimizeCompareInstr(
1536
MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1537
int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1538
assert(CmpInstr.getParent());
1539
assert(MRI);
1540
1541
// Replace SUBSWrr with SUBWrr if NZCV is not used.
1542
int DeadNZCVIdx =
1543
CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1544
if (DeadNZCVIdx != -1) {
1545
if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1546
CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1547
CmpInstr.eraseFromParent();
1548
return true;
1549
}
1550
unsigned Opc = CmpInstr.getOpcode();
1551
unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1552
if (NewOpc == Opc)
1553
return false;
1554
const MCInstrDesc &MCID = get(NewOpc);
1555
CmpInstr.setDesc(MCID);
1556
CmpInstr.removeOperand(DeadNZCVIdx);
1557
bool succeeded = UpdateOperandRegClass(CmpInstr);
1558
(void)succeeded;
1559
assert(succeeded && "Some operands reg class are incompatible!");
1560
return true;
1561
}
1562
1563
if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1564
CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1565
return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1566
1567
if (SrcReg2 != 0)
1568
return false;
1569
1570
// CmpInstr is a Compare instruction if destination register is not used.
1571
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1572
return false;
1573
1574
if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1575
return true;
1576
return (CmpValue == 0 || CmpValue == 1) &&
1577
removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1578
}
1579
1580
/// Get the opcode of the flag-setting (S) version of Instr.
1581
/// If Instr is an S version, its opcode is returned.
1582
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S version
1583
/// or we are not interested in it.
1584
static unsigned sForm(MachineInstr &Instr) {
1585
switch (Instr.getOpcode()) {
1586
default:
1587
return AArch64::INSTRUCTION_LIST_END;
1588
1589
case AArch64::ADDSWrr:
1590
case AArch64::ADDSWri:
1591
case AArch64::ADDSXrr:
1592
case AArch64::ADDSXri:
1593
case AArch64::SUBSWrr:
1594
case AArch64::SUBSWri:
1595
case AArch64::SUBSXrr:
1596
case AArch64::SUBSXri:
1597
return Instr.getOpcode();
1598
1599
case AArch64::ADDWrr:
1600
return AArch64::ADDSWrr;
1601
case AArch64::ADDWri:
1602
return AArch64::ADDSWri;
1603
case AArch64::ADDXrr:
1604
return AArch64::ADDSXrr;
1605
case AArch64::ADDXri:
1606
return AArch64::ADDSXri;
1607
case AArch64::ADCWr:
1608
return AArch64::ADCSWr;
1609
case AArch64::ADCXr:
1610
return AArch64::ADCSXr;
1611
case AArch64::SUBWrr:
1612
return AArch64::SUBSWrr;
1613
case AArch64::SUBWri:
1614
return AArch64::SUBSWri;
1615
case AArch64::SUBXrr:
1616
return AArch64::SUBSXrr;
1617
case AArch64::SUBXri:
1618
return AArch64::SUBSXri;
1619
case AArch64::SBCWr:
1620
return AArch64::SBCSWr;
1621
case AArch64::SBCXr:
1622
return AArch64::SBCSXr;
1623
case AArch64::ANDWri:
1624
return AArch64::ANDSWri;
1625
case AArch64::ANDXri:
1626
return AArch64::ANDSXri;
1627
}
1628
}
1629
1630
/// Check if AArch64::NZCV should be alive in successors of MBB.
1631
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1632
for (auto *BB : MBB->successors())
1633
if (BB->isLiveIn(AArch64::NZCV))
1634
return true;
1635
return false;
1636
}
1637
1638
/// \returns The condition code operand index for \p Instr if it is a branch
1639
/// or select and -1 otherwise.
1640
static int
1641
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1642
switch (Instr.getOpcode()) {
1643
default:
1644
return -1;
1645
1646
case AArch64::Bcc: {
1647
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1648
assert(Idx >= 2);
1649
return Idx - 2;
1650
}
1651
1652
case AArch64::CSINVWr:
1653
case AArch64::CSINVXr:
1654
case AArch64::CSINCWr:
1655
case AArch64::CSINCXr:
1656
case AArch64::CSELWr:
1657
case AArch64::CSELXr:
1658
case AArch64::CSNEGWr:
1659
case AArch64::CSNEGXr:
1660
case AArch64::FCSELSrrr:
1661
case AArch64::FCSELDrrr: {
1662
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1663
assert(Idx >= 1);
1664
return Idx - 1;
1665
}
1666
}
1667
}
1668
1669
/// Find a condition code used by the instruction.
1670
/// Returns AArch64CC::Invalid if either the instruction does not use condition
1671
/// codes or we don't optimize CmpInstr in the presence of such instructions.
1672
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1673
int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1674
return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1675
Instr.getOperand(CCIdx).getImm())
1676
: AArch64CC::Invalid;
1677
}
1678
1679
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1680
assert(CC != AArch64CC::Invalid);
1681
UsedNZCV UsedFlags;
1682
switch (CC) {
1683
default:
1684
break;
1685
1686
case AArch64CC::EQ: // Z set
1687
case AArch64CC::NE: // Z clear
1688
UsedFlags.Z = true;
1689
break;
1690
1691
case AArch64CC::HI: // Z clear and C set
1692
case AArch64CC::LS: // Z set or C clear
1693
UsedFlags.Z = true;
1694
[[fallthrough]];
1695
case AArch64CC::HS: // C set
1696
case AArch64CC::LO: // C clear
1697
UsedFlags.C = true;
1698
break;
1699
1700
case AArch64CC::MI: // N set
1701
case AArch64CC::PL: // N clear
1702
UsedFlags.N = true;
1703
break;
1704
1705
case AArch64CC::VS: // V set
1706
case AArch64CC::VC: // V clear
1707
UsedFlags.V = true;
1708
break;
1709
1710
case AArch64CC::GT: // Z clear, N and V the same
1711
case AArch64CC::LE: // Z set, N and V differ
1712
UsedFlags.Z = true;
1713
[[fallthrough]];
1714
case AArch64CC::GE: // N and V the same
1715
case AArch64CC::LT: // N and V differ
1716
UsedFlags.N = true;
1717
UsedFlags.V = true;
1718
break;
1719
}
1720
return UsedFlags;
1721
}
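// Worked example (not from the original source): "b.hi" uses HI, which reads
// Z and C, so getUsedNZCV(AArch64CC::HI) returns {Z, C}; "b.gt" reads Z, N and
// V; "b.eq" / "b.ne" read only Z.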
1722
1723
/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1724
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
1725
/// \returns std::nullopt otherwise.
1726
///
1727
/// Collect instructions using those flags in \p CCUseInstrs if provided.
1728
std::optional<UsedNZCV>
1729
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1730
const TargetRegisterInfo &TRI,
1731
SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1732
MachineBasicBlock *CmpParent = CmpInstr.getParent();
1733
if (MI.getParent() != CmpParent)
1734
return std::nullopt;
1735
1736
if (areCFlagsAliveInSuccessors(CmpParent))
1737
return std::nullopt;
1738
1739
UsedNZCV NZCVUsedAfterCmp;
1740
for (MachineInstr &Instr : instructionsWithoutDebug(
1741
std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1742
if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1743
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1744
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1745
return std::nullopt;
1746
NZCVUsedAfterCmp |= getUsedNZCV(CC);
1747
if (CCUseInstrs)
1748
CCUseInstrs->push_back(&Instr);
1749
}
1750
if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1751
break;
1752
}
1753
return NZCVUsedAfterCmp;
1754
}
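// Illustrative walk-through (not from the original source; registers are
// arbitrary), assuming MI and the compare sit in the same block and NZCV is
// not live into any successor:
//   subs wzr, w8, #0        ; CmpInstr
//   csel w9, w10, w11, eq   ; reads NZCV; EQ uses only Z
//   adds w12, w13, #1       ; redefines NZCV, the scan stops here
// The function returns {Z} and, if requested, records the CSEL in CCUseInstrs.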
1755
1756
static bool isADDSRegImm(unsigned Opcode) {
1757
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1758
}
1759
1760
static bool isSUBSRegImm(unsigned Opcode) {
1761
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1762
}
1763
1764
/// Check if CmpInstr can be substituted by MI.
1765
///
1766
/// CmpInstr can be substituted:
1767
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1768
/// - and, MI and CmpInstr are from the same MachineBB
1769
/// - and, condition flags are not alive in successors of the CmpInstr parent
1770
/// - and, if MI opcode is the S form there must be no defs of flags between
1771
/// MI and CmpInstr
1772
/// or if MI opcode is not the S form there must be neither defs of flags
1773
/// nor uses of flags between MI and CmpInstr.
1774
/// - and, if C/V flags are not used after CmpInstr
1775
/// or if the V flag is used but MI produces a poison value when signed overflow
1776
/// occurs.
1777
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1778
const TargetRegisterInfo &TRI) {
1779
// NOTE this assertion guarantees that MI.getOpcode() is an add or subtraction
1780
// that may or may not set flags.
1781
assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1782
1783
const unsigned CmpOpcode = CmpInstr.getOpcode();
1784
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1785
return false;
1786
1787
assert((CmpInstr.getOperand(2).isImm() &&
1788
CmpInstr.getOperand(2).getImm() == 0) &&
1789
"Caller guarantees that CmpInstr compares with constant 0");
1790
1791
std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1792
if (!NZVCUsed || NZVCUsed->C)
1793
return false;
1794
1795
// CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1796
// '%vreg = add ...' or '%vreg = sub ...'.
1797
// Condition flag V is used to indicate signed overflow.
1798
// 1) MI and CmpInstr set N and V to the same value.
1799
// 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1800
// signed overflow occurs, so CmpInstr could still be simplified away.
1801
if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1802
return false;
1803
1804
AccessKind AccessToCheck = AK_Write;
1805
if (sForm(MI) != MI.getOpcode())
1806
AccessToCheck = AK_All;
1807
return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1808
}
1809
1810
/// Substitute an instruction comparing to zero with another instruction
1811
/// which produces needed condition flags.
1812
///
1813
/// Return true on success.
1814
bool AArch64InstrInfo::substituteCmpToZero(
1815
MachineInstr &CmpInstr, unsigned SrcReg,
1816
const MachineRegisterInfo &MRI) const {
1817
// Get the unique definition of SrcReg.
1818
MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1819
if (!MI)
1820
return false;
1821
1822
const TargetRegisterInfo &TRI = getRegisterInfo();
1823
1824
unsigned NewOpc = sForm(*MI);
1825
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1826
return false;
1827
1828
if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1829
return false;
1830
1831
// Update the instruction to set NZCV.
1832
MI->setDesc(get(NewOpc));
1833
CmpInstr.eraseFromParent();
1834
bool succeeded = UpdateOperandRegClass(*MI);
1835
(void)succeeded;
1836
assert(succeeded && "Some operands reg class are incompatible!");
1837
MI->addRegisterDefined(AArch64::NZCV, &TRI);
1838
return true;
1839
}
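// Example of the rewrite (not from the original source; registers arbitrary):
//   sub  w8, w9, w10
//   subs wzr, w8, #0        ; cmp w8, #0
//   b.eq <target>
// becomes
//   subs w8, w9, w10
//   b.eq <target>
// The compare is erased and the defining instruction switches to its S form,
// which is safe here because only the Z flag is consumed afterwards.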
1840
1841
/// \returns True if \p CmpInstr can be removed.
1842
///
1843
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1844
/// codes used in \p CCUseInstrs must be inverted.
1845
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1846
int CmpValue, const TargetRegisterInfo &TRI,
1847
SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1848
bool &IsInvertCC) {
1849
assert((CmpValue == 0 || CmpValue == 1) &&
1850
"Only comparisons to 0 or 1 considered for removal!");
1851
1852
// MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1853
unsigned MIOpc = MI.getOpcode();
1854
if (MIOpc == AArch64::CSINCWr) {
1855
if (MI.getOperand(1).getReg() != AArch64::WZR ||
1856
MI.getOperand(2).getReg() != AArch64::WZR)
1857
return false;
1858
} else if (MIOpc == AArch64::CSINCXr) {
1859
if (MI.getOperand(1).getReg() != AArch64::XZR ||
1860
MI.getOperand(2).getReg() != AArch64::XZR)
1861
return false;
1862
} else {
1863
return false;
1864
}
1865
AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1866
if (MICC == AArch64CC::Invalid)
1867
return false;
1868
1869
// NZCV needs to be defined
1870
if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1871
return false;
1872
1873
// CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1874
const unsigned CmpOpcode = CmpInstr.getOpcode();
1875
bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1876
if (CmpValue && !IsSubsRegImm)
1877
return false;
1878
if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1879
return false;
1880
1881
// MI conditions allowed: eq, ne, mi, pl
1882
UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1883
if (MIUsedNZCV.C || MIUsedNZCV.V)
1884
return false;
1885
1886
std::optional<UsedNZCV> NZCVUsedAfterCmp =
1887
examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1888
// Condition flags are not used in CmpInstr basic block successors and only
1889
// Z or N flags are allowed to be used after CmpInstr within its basic block
1890
if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1891
return false;
1892
// Z or N flag used after CmpInstr must correspond to the flag used in MI
1893
if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1894
(MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1895
return false;
1896
// If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
1897
if (MIUsedNZCV.N && !CmpValue)
1898
return false;
1899
1900
// There must be no defs of flags between MI and CmpInstr
1901
if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1902
return false;
1903
1904
// Condition code is inverted in the following cases:
1905
// 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1906
// 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1907
IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1908
(!CmpValue && MICC == AArch64CC::NE);
1909
return true;
1910
}
1911
1912
/// Remove comparison in csinc-cmp sequence
1913
///
1914
/// Examples:
1915
/// 1. \code
1916
/// csinc w9, wzr, wzr, ne
1917
/// cmp w9, #0
1918
/// b.eq
1919
/// \endcode
1920
/// to
1921
/// \code
1922
/// csinc w9, wzr, wzr, ne
1923
/// b.ne
1924
/// \endcode
1925
///
1926
/// 2. \code
1927
/// csinc x2, xzr, xzr, mi
1928
/// cmp x2, #1
1929
/// b.pl
1930
/// \endcode
1931
/// to
1932
/// \code
1933
/// csinc x2, xzr, xzr, mi
1934
/// b.pl
1935
/// \endcode
1936
///
1937
/// \param CmpInstr comparison instruction
1938
/// \return True when comparison removed
1939
bool AArch64InstrInfo::removeCmpToZeroOrOne(
1940
MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1941
const MachineRegisterInfo &MRI) const {
1942
MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1943
if (!MI)
1944
return false;
1945
const TargetRegisterInfo &TRI = getRegisterInfo();
1946
SmallVector<MachineInstr *, 4> CCUseInstrs;
1947
bool IsInvertCC = false;
1948
if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1949
IsInvertCC))
1950
return false;
1951
// Make transformation
1952
CmpInstr.eraseFromParent();
1953
if (IsInvertCC) {
1954
// Invert condition codes in CmpInstr CC users
1955
for (MachineInstr *CCUseInstr : CCUseInstrs) {
1956
int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1957
assert(Idx >= 0 && "Unexpected instruction using CC.");
1958
MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1959
AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1960
static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1961
CCOperand.setImm(CCUse);
1962
}
1963
}
1964
return true;
1965
}
1966
1967
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1968
if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1969
MI.getOpcode() != AArch64::CATCHRET)
1970
return false;
1971
1972
MachineBasicBlock &MBB = *MI.getParent();
1973
auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1974
auto TRI = Subtarget.getRegisterInfo();
1975
DebugLoc DL = MI.getDebugLoc();
1976
1977
if (MI.getOpcode() == AArch64::CATCHRET) {
1978
// Skip to the first instruction before the epilog.
1979
const TargetInstrInfo *TII =
1980
MBB.getParent()->getSubtarget().getInstrInfo();
1981
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1982
auto MBBI = MachineBasicBlock::iterator(MI);
1983
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1984
while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1985
FirstEpilogSEH != MBB.begin())
1986
FirstEpilogSEH = std::prev(FirstEpilogSEH);
1987
if (FirstEpilogSEH != MBB.begin())
1988
FirstEpilogSEH = std::next(FirstEpilogSEH);
1989
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1990
.addReg(AArch64::X0, RegState::Define)
1991
.addMBB(TargetMBB);
1992
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1993
.addReg(AArch64::X0, RegState::Define)
1994
.addReg(AArch64::X0)
1995
.addMBB(TargetMBB)
1996
.addImm(0);
1997
return true;
1998
}
1999
2000
Register Reg = MI.getOperand(0).getReg();
2001
Module &M = *MBB.getParent()->getFunction().getParent();
2002
if (M.getStackProtectorGuard() == "sysreg") {
2003
const AArch64SysReg::SysReg *SrcReg =
2004
AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2005
if (!SrcReg)
2006
report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2007
2008
// mrs xN, sysreg
2009
BuildMI(MBB, MI, DL, get(AArch64::MRS))
2010
.addDef(Reg, RegState::Renamable)
2011
.addImm(SrcReg->Encoding);
2012
int Offset = M.getStackProtectorGuardOffset();
2013
if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2014
// ldr xN, [xN, #offset]
2015
BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2016
.addDef(Reg)
2017
.addUse(Reg, RegState::Kill)
2018
.addImm(Offset / 8);
2019
} else if (Offset >= -256 && Offset <= 255) {
2020
// ldur xN, [xN, #offset]
2021
BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2022
.addDef(Reg)
2023
.addUse(Reg, RegState::Kill)
2024
.addImm(Offset);
2025
} else if (Offset >= -4095 && Offset <= 4095) {
2026
if (Offset > 0) {
2027
// add xN, xN, #offset
2028
BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2029
.addDef(Reg)
2030
.addUse(Reg, RegState::Kill)
2031
.addImm(Offset)
2032
.addImm(0);
2033
} else {
2034
// sub xN, xN, #offset
2035
BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2036
.addDef(Reg)
2037
.addUse(Reg, RegState::Kill)
2038
.addImm(-Offset)
2039
.addImm(0);
2040
}
2041
// ldr xN, [xN]
2042
BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2043
.addDef(Reg)
2044
.addUse(Reg, RegState::Kill)
2045
.addImm(0);
2046
} else {
2047
// Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2048
// than 32760.
2049
// It might be nice to use AArch64::MOVi32imm here, which would get
2050
// expanded in PreSched2 after PostRA, but our lone scratch Reg already
2051
// contains the MRS result. findScratchNonCalleeSaveRegister() in
2052
// AArch64FrameLowering might help us find such a scratch register
2053
// though. If we failed to find a scratch register, we could emit a
2054
// stream of add instructions to build up the immediate. Or, we could try
2055
// to insert a AArch64::MOVi32imm before register allocation so that we
2056
// didn't need to scavenge for a scratch register.
2057
report_fatal_error("Unable to encode Stack Protector Guard Offset");
2058
}
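// For illustration (not from the original source; the system register, result
// register and offsets are made up): with guard register TPIDR_EL0 and offset
// 40 the scaled form applies:
//   mrs  x0, TPIDR_EL0
//   ldr  x0, [x0, #40]
// offset -32 falls back to the unscaled load:
//   mrs  x0, TPIDR_EL0
//   ldur x0, [x0, #-32]
// and offset 2001 (unaligned, outside the ldur range) uses add + ldr:
//   mrs  x0, TPIDR_EL0
//   add  x0, x0, #2001
//   ldr  x0, [x0]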
2059
MBB.erase(MI);
2060
return true;
2061
}
2062
2063
const GlobalValue *GV =
2064
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2065
const TargetMachine &TM = MBB.getParent()->getTarget();
2066
unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2067
const unsigned char MO_NC = AArch64II::MO_NC;
2068
2069
if ((OpFlags & AArch64II::MO_GOT) != 0) {
2070
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2071
.addGlobalAddress(GV, 0, OpFlags);
2072
if (Subtarget.isTargetILP32()) {
2073
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2074
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2075
.addDef(Reg32, RegState::Dead)
2076
.addUse(Reg, RegState::Kill)
2077
.addImm(0)
2078
.addMemOperand(*MI.memoperands_begin())
2079
.addDef(Reg, RegState::Implicit);
2080
} else {
2081
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2082
.addReg(Reg, RegState::Kill)
2083
.addImm(0)
2084
.addMemOperand(*MI.memoperands_begin());
2085
}
2086
} else if (TM.getCodeModel() == CodeModel::Large) {
2087
assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2088
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2089
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2090
.addImm(0);
2091
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2092
.addReg(Reg, RegState::Kill)
2093
.addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2094
.addImm(16);
2095
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2096
.addReg(Reg, RegState::Kill)
2097
.addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2098
.addImm(32);
2099
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2100
.addReg(Reg, RegState::Kill)
2101
.addGlobalAddress(GV, 0, AArch64II::MO_G3)
2102
.addImm(48);
2103
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2104
.addReg(Reg, RegState::Kill)
2105
.addImm(0)
2106
.addMemOperand(*MI.memoperands_begin());
2107
} else if (TM.getCodeModel() == CodeModel::Tiny) {
2108
BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2109
.addGlobalAddress(GV, 0, OpFlags);
2110
} else {
2111
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2112
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2113
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2114
if (Subtarget.isTargetILP32()) {
2115
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2116
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2117
.addDef(Reg32, RegState::Dead)
2118
.addUse(Reg, RegState::Kill)
2119
.addGlobalAddress(GV, 0, LoFlags)
2120
.addMemOperand(*MI.memoperands_begin())
2121
.addDef(Reg, RegState::Implicit);
2122
} else {
2123
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2124
.addReg(Reg, RegState::Kill)
2125
.addGlobalAddress(GV, 0, LoFlags)
2126
.addMemOperand(*MI.memoperands_begin());
2127
}
2128
}
2129
2130
MBB.erase(MI);
2131
2132
return true;
2133
}
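// Sketch of the LOAD_STACK_GUARD expansion for a global guard value (not from
// the original source; the register and symbol are assumed). With a
// GOT-indirect reference this becomes roughly:
//   adrp x8, :got:__stack_chk_guard
//   ldr  x8, [x8, :got_lo12:__stack_chk_guard]
// a direct small-code-model reference uses
//   adrp x8, __stack_chk_guard
//   ldr  x8, [x8, :lo12:__stack_chk_guard]
// and the large code model materializes the address with movz/movk before the
// final load.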
2134
2135
// Return true if this instruction simply sets its single destination register
2136
// to zero. This is equivalent to a register rename of the zero-register.
2137
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2138
switch (MI.getOpcode()) {
2139
default:
2140
break;
2141
case AArch64::MOVZWi:
2142
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2143
if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2144
assert(MI.getDesc().getNumOperands() == 3 &&
2145
MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2146
return true;
2147
}
2148
break;
2149
case AArch64::ANDWri: // and Rd, Rzr, #imm
2150
return MI.getOperand(1).getReg() == AArch64::WZR;
2151
case AArch64::ANDXri:
2152
return MI.getOperand(1).getReg() == AArch64::XZR;
2153
case TargetOpcode::COPY:
2154
return MI.getOperand(1).getReg() == AArch64::WZR;
2155
}
2156
return false;
2157
}
2158
2159
// Return true if this instruction simply renames a general register without
2160
// modifying bits.
2161
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2162
switch (MI.getOpcode()) {
2163
default:
2164
break;
2165
case TargetOpcode::COPY: {
2166
// GPR32 copies will be lowered to ORRXrs
2167
Register DstReg = MI.getOperand(0).getReg();
2168
return (AArch64::GPR32RegClass.contains(DstReg) ||
2169
AArch64::GPR64RegClass.contains(DstReg));
2170
}
2171
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2172
if (MI.getOperand(1).getReg() == AArch64::XZR) {
2173
assert(MI.getDesc().getNumOperands() == 4 &&
2174
MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2175
return true;
2176
}
2177
break;
2178
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2179
if (MI.getOperand(2).getImm() == 0) {
2180
assert(MI.getDesc().getNumOperands() == 4 &&
2181
MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2182
return true;
2183
}
2184
break;
2185
}
2186
return false;
2187
}
2188
2189
// Return true if this instruction simply renames a floating-point register without
2190
// modifying bits.
2191
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2192
switch (MI.getOpcode()) {
2193
default:
2194
break;
2195
case TargetOpcode::COPY: {
2196
Register DstReg = MI.getOperand(0).getReg();
2197
return AArch64::FPR128RegClass.contains(DstReg);
2198
}
2199
case AArch64::ORRv16i8:
2200
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2201
assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2202
"invalid ORRv16i8 operands");
2203
return true;
2204
}
2205
break;
2206
}
2207
return false;
2208
}
2209
2210
Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2211
int &FrameIndex) const {
2212
switch (MI.getOpcode()) {
2213
default:
2214
break;
2215
case AArch64::LDRWui:
2216
case AArch64::LDRXui:
2217
case AArch64::LDRBui:
2218
case AArch64::LDRHui:
2219
case AArch64::LDRSui:
2220
case AArch64::LDRDui:
2221
case AArch64::LDRQui:
2222
case AArch64::LDR_PXI:
2223
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2224
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2225
FrameIndex = MI.getOperand(1).getIndex();
2226
return MI.getOperand(0).getReg();
2227
}
2228
break;
2229
}
2230
2231
return 0;
2232
}
2233
2234
Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2235
int &FrameIndex) const {
2236
switch (MI.getOpcode()) {
2237
default:
2238
break;
2239
case AArch64::STRWui:
2240
case AArch64::STRXui:
2241
case AArch64::STRBui:
2242
case AArch64::STRHui:
2243
case AArch64::STRSui:
2244
case AArch64::STRDui:
2245
case AArch64::STRQui:
2246
case AArch64::STR_PXI:
2247
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2248
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2249
FrameIndex = MI.getOperand(1).getIndex();
2250
return MI.getOperand(0).getReg();
2251
}
2252
break;
2253
}
2254
return 0;
2255
}
2256
2257
/// Check all MachineMemOperands for a hint to suppress pairing.
2258
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2259
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2260
return MMO->getFlags() & MOSuppressPair;
2261
});
2262
}
2263
2264
/// Set a flag on the first MachineMemOperand to suppress pairing.
2265
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2266
if (MI.memoperands_empty())
2267
return;
2268
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
2269
}
2270
2271
/// Check all MachineMemOperands for a hint that the load/store is strided.
2272
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2273
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2274
return MMO->getFlags() & MOStridedAccess;
2275
});
2276
}
2277
2278
bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2279
switch (Opc) {
2280
default:
2281
return false;
2282
case AArch64::STURSi:
2283
case AArch64::STRSpre:
2284
case AArch64::STURDi:
2285
case AArch64::STRDpre:
2286
case AArch64::STURQi:
2287
case AArch64::STRQpre:
2288
case AArch64::STURBBi:
2289
case AArch64::STURHHi:
2290
case AArch64::STURWi:
2291
case AArch64::STRWpre:
2292
case AArch64::STURXi:
2293
case AArch64::STRXpre:
2294
case AArch64::LDURSi:
2295
case AArch64::LDRSpre:
2296
case AArch64::LDURDi:
2297
case AArch64::LDRDpre:
2298
case AArch64::LDURQi:
2299
case AArch64::LDRQpre:
2300
case AArch64::LDURWi:
2301
case AArch64::LDRWpre:
2302
case AArch64::LDURXi:
2303
case AArch64::LDRXpre:
2304
case AArch64::LDRSWpre:
2305
case AArch64::LDURSWi:
2306
case AArch64::LDURHHi:
2307
case AArch64::LDURBBi:
2308
case AArch64::LDURSBWi:
2309
case AArch64::LDURSHWi:
2310
return true;
2311
}
2312
}
2313
2314
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2315
switch (Opc) {
2316
default: return {};
2317
case AArch64::PRFMui: return AArch64::PRFUMi;
2318
case AArch64::LDRXui: return AArch64::LDURXi;
2319
case AArch64::LDRWui: return AArch64::LDURWi;
2320
case AArch64::LDRBui: return AArch64::LDURBi;
2321
case AArch64::LDRHui: return AArch64::LDURHi;
2322
case AArch64::LDRSui: return AArch64::LDURSi;
2323
case AArch64::LDRDui: return AArch64::LDURDi;
2324
case AArch64::LDRQui: return AArch64::LDURQi;
2325
case AArch64::LDRBBui: return AArch64::LDURBBi;
2326
case AArch64::LDRHHui: return AArch64::LDURHHi;
2327
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2328
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2329
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2330
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2331
case AArch64::LDRSWui: return AArch64::LDURSWi;
2332
case AArch64::STRXui: return AArch64::STURXi;
2333
case AArch64::STRWui: return AArch64::STURWi;
2334
case AArch64::STRBui: return AArch64::STURBi;
2335
case AArch64::STRHui: return AArch64::STURHi;
2336
case AArch64::STRSui: return AArch64::STURSi;
2337
case AArch64::STRDui: return AArch64::STURDi;
2338
case AArch64::STRQui: return AArch64::STURQi;
2339
case AArch64::STRBBui: return AArch64::STURBBi;
2340
case AArch64::STRHHui: return AArch64::STURHHi;
2341
}
2342
}
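// Note on scaling (added for clarity, not in the original source): the scaled
// and unscaled forms address the same memory but encode the immediate
// differently, e.g.
//   ldr  x0, [x1, #16]   ; LDRXui, encoded immediate = 16 / 8 = 2
//   ldur x0, [x1, #16]   ; LDURXi, encoded immediate = 16 (signed byte offset,
//                        ;         range -256 .. 255)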
2343
2344
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2345
switch (Opc) {
2346
default:
2347
return 2;
2348
case AArch64::LDPXi:
2349
case AArch64::LDPDi:
2350
case AArch64::STPXi:
2351
case AArch64::STPDi:
2352
case AArch64::LDNPXi:
2353
case AArch64::LDNPDi:
2354
case AArch64::STNPXi:
2355
case AArch64::STNPDi:
2356
case AArch64::LDPQi:
2357
case AArch64::STPQi:
2358
case AArch64::LDNPQi:
2359
case AArch64::STNPQi:
2360
case AArch64::LDPWi:
2361
case AArch64::LDPSi:
2362
case AArch64::STPWi:
2363
case AArch64::STPSi:
2364
case AArch64::LDNPWi:
2365
case AArch64::LDNPSi:
2366
case AArch64::STNPWi:
2367
case AArch64::STNPSi:
2368
case AArch64::LDG:
2369
case AArch64::STGPi:
2370
2371
case AArch64::LD1B_IMM:
2372
case AArch64::LD1B_H_IMM:
2373
case AArch64::LD1B_S_IMM:
2374
case AArch64::LD1B_D_IMM:
2375
case AArch64::LD1SB_H_IMM:
2376
case AArch64::LD1SB_S_IMM:
2377
case AArch64::LD1SB_D_IMM:
2378
case AArch64::LD1H_IMM:
2379
case AArch64::LD1H_S_IMM:
2380
case AArch64::LD1H_D_IMM:
2381
case AArch64::LD1SH_S_IMM:
2382
case AArch64::LD1SH_D_IMM:
2383
case AArch64::LD1W_IMM:
2384
case AArch64::LD1W_D_IMM:
2385
case AArch64::LD1SW_D_IMM:
2386
case AArch64::LD1D_IMM:
2387
2388
case AArch64::LD2B_IMM:
2389
case AArch64::LD2H_IMM:
2390
case AArch64::LD2W_IMM:
2391
case AArch64::LD2D_IMM:
2392
case AArch64::LD3B_IMM:
2393
case AArch64::LD3H_IMM:
2394
case AArch64::LD3W_IMM:
2395
case AArch64::LD3D_IMM:
2396
case AArch64::LD4B_IMM:
2397
case AArch64::LD4H_IMM:
2398
case AArch64::LD4W_IMM:
2399
case AArch64::LD4D_IMM:
2400
2401
case AArch64::ST1B_IMM:
2402
case AArch64::ST1B_H_IMM:
2403
case AArch64::ST1B_S_IMM:
2404
case AArch64::ST1B_D_IMM:
2405
case AArch64::ST1H_IMM:
2406
case AArch64::ST1H_S_IMM:
2407
case AArch64::ST1H_D_IMM:
2408
case AArch64::ST1W_IMM:
2409
case AArch64::ST1W_D_IMM:
2410
case AArch64::ST1D_IMM:
2411
2412
case AArch64::ST2B_IMM:
2413
case AArch64::ST2H_IMM:
2414
case AArch64::ST2W_IMM:
2415
case AArch64::ST2D_IMM:
2416
case AArch64::ST3B_IMM:
2417
case AArch64::ST3H_IMM:
2418
case AArch64::ST3W_IMM:
2419
case AArch64::ST3D_IMM:
2420
case AArch64::ST4B_IMM:
2421
case AArch64::ST4H_IMM:
2422
case AArch64::ST4W_IMM:
2423
case AArch64::ST4D_IMM:
2424
2425
case AArch64::LD1RB_IMM:
2426
case AArch64::LD1RB_H_IMM:
2427
case AArch64::LD1RB_S_IMM:
2428
case AArch64::LD1RB_D_IMM:
2429
case AArch64::LD1RSB_H_IMM:
2430
case AArch64::LD1RSB_S_IMM:
2431
case AArch64::LD1RSB_D_IMM:
2432
case AArch64::LD1RH_IMM:
2433
case AArch64::LD1RH_S_IMM:
2434
case AArch64::LD1RH_D_IMM:
2435
case AArch64::LD1RSH_S_IMM:
2436
case AArch64::LD1RSH_D_IMM:
2437
case AArch64::LD1RW_IMM:
2438
case AArch64::LD1RW_D_IMM:
2439
case AArch64::LD1RSW_IMM:
2440
case AArch64::LD1RD_IMM:
2441
2442
case AArch64::LDNT1B_ZRI:
2443
case AArch64::LDNT1H_ZRI:
2444
case AArch64::LDNT1W_ZRI:
2445
case AArch64::LDNT1D_ZRI:
2446
case AArch64::STNT1B_ZRI:
2447
case AArch64::STNT1H_ZRI:
2448
case AArch64::STNT1W_ZRI:
2449
case AArch64::STNT1D_ZRI:
2450
2451
case AArch64::LDNF1B_IMM:
2452
case AArch64::LDNF1B_H_IMM:
2453
case AArch64::LDNF1B_S_IMM:
2454
case AArch64::LDNF1B_D_IMM:
2455
case AArch64::LDNF1SB_H_IMM:
2456
case AArch64::LDNF1SB_S_IMM:
2457
case AArch64::LDNF1SB_D_IMM:
2458
case AArch64::LDNF1H_IMM:
2459
case AArch64::LDNF1H_S_IMM:
2460
case AArch64::LDNF1H_D_IMM:
2461
case AArch64::LDNF1SH_S_IMM:
2462
case AArch64::LDNF1SH_D_IMM:
2463
case AArch64::LDNF1W_IMM:
2464
case AArch64::LDNF1W_D_IMM:
2465
case AArch64::LDNF1SW_D_IMM:
2466
case AArch64::LDNF1D_IMM:
2467
return 3;
2468
case AArch64::ADDG:
2469
case AArch64::STGi:
2470
case AArch64::LDR_PXI:
2471
case AArch64::STR_PXI:
2472
return 2;
2473
}
2474
}
2475
2476
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2477
switch (MI.getOpcode()) {
2478
default:
2479
return false;
2480
// Scaled instructions.
2481
case AArch64::STRSui:
2482
case AArch64::STRDui:
2483
case AArch64::STRQui:
2484
case AArch64::STRXui:
2485
case AArch64::STRWui:
2486
case AArch64::LDRSui:
2487
case AArch64::LDRDui:
2488
case AArch64::LDRQui:
2489
case AArch64::LDRXui:
2490
case AArch64::LDRWui:
2491
case AArch64::LDRSWui:
2492
// Unscaled instructions.
2493
case AArch64::STURSi:
2494
case AArch64::STRSpre:
2495
case AArch64::STURDi:
2496
case AArch64::STRDpre:
2497
case AArch64::STURQi:
2498
case AArch64::STRQpre:
2499
case AArch64::STURWi:
2500
case AArch64::STRWpre:
2501
case AArch64::STURXi:
2502
case AArch64::STRXpre:
2503
case AArch64::LDURSi:
2504
case AArch64::LDRSpre:
2505
case AArch64::LDURDi:
2506
case AArch64::LDRDpre:
2507
case AArch64::LDURQi:
2508
case AArch64::LDRQpre:
2509
case AArch64::LDURWi:
2510
case AArch64::LDRWpre:
2511
case AArch64::LDURXi:
2512
case AArch64::LDRXpre:
2513
case AArch64::LDURSWi:
2514
case AArch64::LDRSWpre:
2515
return true;
2516
}
2517
}
2518
2519
bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2520
switch (MI.getOpcode()) {
2521
default:
2522
assert((!MI.isCall() || !MI.isReturn()) &&
2523
"Unexpected instruction - was a new tail call opcode introduced?");
2524
return false;
2525
case AArch64::TCRETURNdi:
2526
case AArch64::TCRETURNri:
2527
case AArch64::TCRETURNrix16x17:
2528
case AArch64::TCRETURNrix17:
2529
case AArch64::TCRETURNrinotx16:
2530
case AArch64::TCRETURNriALL:
2531
case AArch64::AUTH_TCRETURN:
2532
case AArch64::AUTH_TCRETURN_BTI:
2533
return true;
2534
}
2535
}
2536
2537
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2538
switch (Opc) {
2539
default:
2540
llvm_unreachable("Opcode has no flag setting equivalent!");
2541
// 32-bit cases:
2542
case AArch64::ADDWri:
2543
return AArch64::ADDSWri;
2544
case AArch64::ADDWrr:
2545
return AArch64::ADDSWrr;
2546
case AArch64::ADDWrs:
2547
return AArch64::ADDSWrs;
2548
case AArch64::ADDWrx:
2549
return AArch64::ADDSWrx;
2550
case AArch64::ANDWri:
2551
return AArch64::ANDSWri;
2552
case AArch64::ANDWrr:
2553
return AArch64::ANDSWrr;
2554
case AArch64::ANDWrs:
2555
return AArch64::ANDSWrs;
2556
case AArch64::BICWrr:
2557
return AArch64::BICSWrr;
2558
case AArch64::BICWrs:
2559
return AArch64::BICSWrs;
2560
case AArch64::SUBWri:
2561
return AArch64::SUBSWri;
2562
case AArch64::SUBWrr:
2563
return AArch64::SUBSWrr;
2564
case AArch64::SUBWrs:
2565
return AArch64::SUBSWrs;
2566
case AArch64::SUBWrx:
2567
return AArch64::SUBSWrx;
2568
// 64-bit cases:
2569
case AArch64::ADDXri:
2570
return AArch64::ADDSXri;
2571
case AArch64::ADDXrr:
2572
return AArch64::ADDSXrr;
2573
case AArch64::ADDXrs:
2574
return AArch64::ADDSXrs;
2575
case AArch64::ADDXrx:
2576
return AArch64::ADDSXrx;
2577
case AArch64::ANDXri:
2578
return AArch64::ANDSXri;
2579
case AArch64::ANDXrr:
2580
return AArch64::ANDSXrr;
2581
case AArch64::ANDXrs:
2582
return AArch64::ANDSXrs;
2583
case AArch64::BICXrr:
2584
return AArch64::BICSXrr;
2585
case AArch64::BICXrs:
2586
return AArch64::BICSXrs;
2587
case AArch64::SUBXri:
2588
return AArch64::SUBSXri;
2589
case AArch64::SUBXrr:
2590
return AArch64::SUBSXrr;
2591
case AArch64::SUBXrs:
2592
return AArch64::SUBSXrs;
2593
case AArch64::SUBXrx:
2594
return AArch64::SUBSXrx;
2595
// SVE instructions:
2596
case AArch64::AND_PPzPP:
2597
return AArch64::ANDS_PPzPP;
2598
case AArch64::BIC_PPzPP:
2599
return AArch64::BICS_PPzPP;
2600
case AArch64::EOR_PPzPP:
2601
return AArch64::EORS_PPzPP;
2602
case AArch64::NAND_PPzPP:
2603
return AArch64::NANDS_PPzPP;
2604
case AArch64::NOR_PPzPP:
2605
return AArch64::NORS_PPzPP;
2606
case AArch64::ORN_PPzPP:
2607
return AArch64::ORNS_PPzPP;
2608
case AArch64::ORR_PPzPP:
2609
return AArch64::ORRS_PPzPP;
2610
case AArch64::BRKA_PPzP:
2611
return AArch64::BRKAS_PPzP;
2612
case AArch64::BRKPA_PPzPP:
2613
return AArch64::BRKPAS_PPzPP;
2614
case AArch64::BRKB_PPzP:
2615
return AArch64::BRKBS_PPzP;
2616
case AArch64::BRKPB_PPzPP:
2617
return AArch64::BRKPBS_PPzPP;
2618
case AArch64::BRKN_PPzP:
2619
return AArch64::BRKNS_PPzP;
2620
case AArch64::RDFFR_PPz:
2621
return AArch64::RDFFRS_PPz;
2622
case AArch64::PTRUE_B:
2623
return AArch64::PTRUES_B;
2624
}
2625
}
2626
2627
// Is this a candidate for ld/st merging or pairing? For example, we don't
2628
// touch volatiles or load/stores that have a hint to avoid pair formation.
2629
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2630
2631
bool IsPreLdSt = isPreLdSt(MI);
2632
2633
// If this is a volatile load/store, don't mess with it.
2634
if (MI.hasOrderedMemoryRef())
2635
return false;
2636
2637
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
2638
// For Pre-inc LD/ST, the operand is shifted by one.
2639
assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2640
MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2641
"Expected a reg or frame index operand.");
2642
2643
// For Pre-indexed addressing quadword instructions, the third operand is the
2644
// immediate value.
2645
bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2646
2647
if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2648
return false;
2649
2650
// Can't merge/pair if the instruction modifies the base register.
2651
// e.g., ldr x0, [x0]
2652
// This case will never occur with an FI base.
2653
// However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2654
// STR<S,D,Q,W,X>pre, it can be merged.
2655
// For example:
2656
// ldr q0, [x11, #32]!
2657
// ldr q1, [x11, #16]
2658
// to
2659
// ldp q0, q1, [x11, #32]!
2660
if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2661
Register BaseReg = MI.getOperand(1).getReg();
2662
const TargetRegisterInfo *TRI = &getRegisterInfo();
2663
if (MI.modifiesRegister(BaseReg, TRI))
2664
return false;
2665
}
2666
2667
// Check if this load/store has a hint to avoid pair formation.
2668
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2669
if (isLdStPairSuppressed(MI))
2670
return false;
2671
2672
// Do not pair any callee-save store/reload instructions in the
2673
// prologue/epilogue if the CFI information encoded the operations as separate
2674
// instructions, as that will cause the size of the actual prologue to mismatch
2675
// with the prologue size recorded in the Windows CFI.
2676
const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2677
bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2678
MI.getMF()->getFunction().needsUnwindTableEntry();
2679
if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2680
MI.getFlag(MachineInstr::FrameDestroy)))
2681
return false;
2682
2683
// On some CPUs quad load/store pairs are slower than two single load/stores.
2684
if (Subtarget.isPaired128Slow()) {
2685
switch (MI.getOpcode()) {
2686
default:
2687
break;
2688
case AArch64::LDURQi:
2689
case AArch64::STURQi:
2690
case AArch64::LDRQui:
2691
case AArch64::STRQui:
2692
return false;
2693
}
2694
}
2695
2696
return true;
2697
}
2698
2699
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2700
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2701
int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2702
const TargetRegisterInfo *TRI) const {
2703
if (!LdSt.mayLoadOrStore())
2704
return false;
2705
2706
const MachineOperand *BaseOp;
2707
TypeSize WidthN(0, false);
2708
if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2709
WidthN, TRI))
2710
return false;
2711
// The maximum vscale is 16 under AArch64, return the maximal extent for the
2712
// vector.
2713
Width = LocationSize::precise(WidthN);
2714
BaseOps.push_back(BaseOp);
2715
return true;
2716
}
2717
2718
std::optional<ExtAddrMode>
2719
AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2720
const TargetRegisterInfo *TRI) const {
2721
const MachineOperand *Base; // Filled with the base operand of MI.
2722
int64_t Offset; // Filled with the offset of MI.
2723
bool OffsetIsScalable;
2724
if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2725
return std::nullopt;
2726
2727
if (!Base->isReg())
2728
return std::nullopt;
2729
ExtAddrMode AM;
2730
AM.BaseReg = Base->getReg();
2731
AM.Displacement = Offset;
2732
AM.ScaledReg = 0;
2733
AM.Scale = 0;
2734
return AM;
2735
}
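// For illustration (not from the original source): for "ldr x0, [x1, #16]"
// this returns an ExtAddrMode with BaseReg = x1, Displacement = 16,
// ScaledReg = 0 and Scale = 0.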
2736
2737
bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2738
Register Reg,
2739
const MachineInstr &AddrI,
2740
ExtAddrMode &AM) const {
2741
// Filter out instructions into which we cannot fold.
2742
unsigned NumBytes;
2743
int64_t OffsetScale = 1;
2744
switch (MemI.getOpcode()) {
2745
default:
2746
return false;
2747
2748
case AArch64::LDURQi:
2749
case AArch64::STURQi:
2750
NumBytes = 16;
2751
break;
2752
2753
case AArch64::LDURDi:
2754
case AArch64::STURDi:
2755
case AArch64::LDURXi:
2756
case AArch64::STURXi:
2757
NumBytes = 8;
2758
break;
2759
2760
case AArch64::LDURWi:
2761
case AArch64::LDURSWi:
2762
case AArch64::STURWi:
2763
NumBytes = 4;
2764
break;
2765
2766
case AArch64::LDURHi:
2767
case AArch64::STURHi:
2768
case AArch64::LDURHHi:
2769
case AArch64::STURHHi:
2770
case AArch64::LDURSHXi:
2771
case AArch64::LDURSHWi:
2772
NumBytes = 2;
2773
break;
2774
2775
case AArch64::LDRBroX:
2776
case AArch64::LDRBBroX:
2777
case AArch64::LDRSBXroX:
2778
case AArch64::LDRSBWroX:
2779
case AArch64::STRBroX:
2780
case AArch64::STRBBroX:
2781
case AArch64::LDURBi:
2782
case AArch64::LDURBBi:
2783
case AArch64::LDURSBXi:
2784
case AArch64::LDURSBWi:
2785
case AArch64::STURBi:
2786
case AArch64::STURBBi:
2787
case AArch64::LDRBui:
2788
case AArch64::LDRBBui:
2789
case AArch64::LDRSBXui:
2790
case AArch64::LDRSBWui:
2791
case AArch64::STRBui:
2792
case AArch64::STRBBui:
2793
NumBytes = 1;
2794
break;
2795
2796
case AArch64::LDRQroX:
2797
case AArch64::STRQroX:
2798
case AArch64::LDRQui:
2799
case AArch64::STRQui:
2800
NumBytes = 16;
2801
OffsetScale = 16;
2802
break;
2803
2804
case AArch64::LDRDroX:
2805
case AArch64::STRDroX:
2806
case AArch64::LDRXroX:
2807
case AArch64::STRXroX:
2808
case AArch64::LDRDui:
2809
case AArch64::STRDui:
2810
case AArch64::LDRXui:
2811
case AArch64::STRXui:
2812
NumBytes = 8;
2813
OffsetScale = 8;
2814
break;
2815
2816
case AArch64::LDRWroX:
2817
case AArch64::LDRSWroX:
2818
case AArch64::STRWroX:
2819
case AArch64::LDRWui:
2820
case AArch64::LDRSWui:
2821
case AArch64::STRWui:
2822
NumBytes = 4;
2823
OffsetScale = 4;
2824
break;
2825
2826
case AArch64::LDRHroX:
2827
case AArch64::STRHroX:
2828
case AArch64::LDRHHroX:
2829
case AArch64::STRHHroX:
2830
case AArch64::LDRSHXroX:
2831
case AArch64::LDRSHWroX:
2832
case AArch64::LDRHui:
2833
case AArch64::STRHui:
2834
case AArch64::LDRHHui:
2835
case AArch64::STRHHui:
2836
case AArch64::LDRSHXui:
2837
case AArch64::LDRSHWui:
2838
NumBytes = 2;
2839
OffsetScale = 2;
2840
break;
2841
}
2842
2843
// Check the fold operand is not the loaded/stored value.
2844
const MachineOperand &BaseRegOp = MemI.getOperand(0);
2845
if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2846
return false;
2847
2848
// Handle memory instructions with a [Reg, Reg] addressing mode.
2849
if (MemI.getOperand(2).isReg()) {
2850
// Bail if the addressing mode already includes extension of the offset
2851
// register.
2852
if (MemI.getOperand(3).getImm())
2853
return false;
2854
2855
// Check if we actually have a scaled offset.
2856
if (MemI.getOperand(4).getImm() == 0)
2857
OffsetScale = 1;
2858
2859
// If the address instruction is folded into the base register, then the
2860
// addressing mode must not have a scale. Then we can swap the base and the
2861
// scaled registers.
2862
if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2863
return false;
2864
2865
switch (AddrI.getOpcode()) {
2866
default:
2867
return false;
2868
2869
case AArch64::SBFMXri:
2870
// sxtw Xa, Wm
2871
// ldr Xd, [Xn, Xa, lsl #N]
2872
// ->
2873
// ldr Xd, [Xn, Wm, sxtw #N]
2874
if (AddrI.getOperand(2).getImm() != 0 ||
2875
AddrI.getOperand(3).getImm() != 31)
2876
return false;
2877
2878
AM.BaseReg = MemI.getOperand(1).getReg();
2879
if (AM.BaseReg == Reg)
2880
AM.BaseReg = MemI.getOperand(2).getReg();
2881
AM.ScaledReg = AddrI.getOperand(1).getReg();
2882
AM.Scale = OffsetScale;
2883
AM.Displacement = 0;
2884
AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2885
return true;
2886
2887
case TargetOpcode::SUBREG_TO_REG: {
2888
// mov Wa, Wm
2889
// ldr Xd, [Xn, Xa, lsl #N]
2890
// ->
2891
// ldr Xd, [Xn, Wm, uxtw #N]
2892
2893
// Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2894
if (AddrI.getOperand(1).getImm() != 0 ||
2895
AddrI.getOperand(3).getImm() != AArch64::sub_32)
2896
return false;
2897
2898
const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2899
Register OffsetReg = AddrI.getOperand(2).getReg();
2900
if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2901
return false;
2902
2903
const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2904
if (DefMI.getOpcode() != AArch64::ORRWrs ||
2905
DefMI.getOperand(1).getReg() != AArch64::WZR ||
2906
DefMI.getOperand(3).getImm() != 0)
2907
return false;
2908
2909
AM.BaseReg = MemI.getOperand(1).getReg();
2910
if (AM.BaseReg == Reg)
2911
AM.BaseReg = MemI.getOperand(2).getReg();
2912
AM.ScaledReg = DefMI.getOperand(2).getReg();
2913
AM.Scale = OffsetScale;
2914
AM.Displacement = 0;
2915
AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2916
return true;
2917
}
2918
}
2919
}
2920
2921
// Handle memory instructions with a [Reg, #Imm] addressing mode.
2922
2923
// Check we are not breaking a potential conversion to an LDP.
2924
auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2925
int64_t NewOffset) -> bool {
2926
int64_t MinOffset, MaxOffset;
2927
switch (NumBytes) {
2928
default:
2929
return true;
2930
case 4:
2931
MinOffset = -256;
2932
MaxOffset = 252;
2933
break;
2934
case 8:
2935
MinOffset = -512;
2936
MaxOffset = 504;
2937
break;
2938
case 16:
2939
MinOffset = -1024;
2940
MaxOffset = 1008;
2941
break;
2942
}
2943
return OldOffset < MinOffset || OldOffset > MaxOffset ||
2944
(NewOffset >= MinOffset && NewOffset <= MaxOffset);
2945
};
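// The bounds above come from LDP's signed, scaled 7-bit immediate (comment
// added for clarity, not in the original source): the byte offset must be
// size * [-64, 63], e.g. 8-byte accesses allow -512 .. 504 and 16-byte
// accesses -1024 .. 1008.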
2946
auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2947
int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2948
int64_t NewOffset = OldOffset + Disp;
2949
if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2950
return false;
2951
// If the old offset would fit into an LDP, but the new offset wouldn't,
2952
// bail out.
2953
if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2954
return false;
2955
AM.BaseReg = AddrI.getOperand(1).getReg();
2956
AM.ScaledReg = 0;
2957
AM.Scale = 0;
2958
AM.Displacement = NewOffset;
2959
AM.Form = ExtAddrMode::Formula::Basic;
2960
return true;
2961
};
2962
2963
auto canFoldAddRegIntoAddrMode =
2964
[&](int64_t Scale,
2965
ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2966
if (MemI.getOperand(2).getImm() != 0)
2967
return false;
2968
if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2969
return false;
2970
AM.BaseReg = AddrI.getOperand(1).getReg();
2971
AM.ScaledReg = AddrI.getOperand(2).getReg();
2972
AM.Scale = Scale;
2973
AM.Displacement = 0;
2974
AM.Form = Form;
2975
return true;
2976
};
2977
2978
auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2979
unsigned Opcode = MemI.getOpcode();
2980
return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2981
Subtarget.isSTRQroSlow();
2982
};
2983
2984
int64_t Disp = 0;
2985
const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2986
switch (AddrI.getOpcode()) {
2987
default:
2988
return false;
2989
2990
case AArch64::ADDXri:
2991
// add Xa, Xn, #N
2992
// ldr Xd, [Xa, #M]
2993
// ->
2994
// ldr Xd, [Xn, #N'+M]
2995
Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2996
return canFoldAddSubImmIntoAddrMode(Disp);
2997
2998
case AArch64::SUBXri:
2999
// sub Xa, Xn, #N
3000
// ldr Xd, [Xa, #M]
3001
// ->
3002
// ldr Xd, [Xn, #N'+M]
3003
Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3004
return canFoldAddSubImmIntoAddrMode(-Disp);
3005
3006
case AArch64::ADDXrs: {
3007
// add Xa, Xn, Xm, lsl #N
3008
// ldr Xd, [Xa]
3009
// ->
3010
// ldr Xd, [Xn, Xm, lsl #N]
3011
3012
// Don't fold the add if the result would be slower, unless optimising for
3013
// size.
3014
unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3015
if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3016
return false;
3017
Shift = AArch64_AM::getShiftValue(Shift);
3018
if (!OptSize) {
3019
if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3020
return false;
3021
if (avoidSlowSTRQ(MemI))
3022
return false;
3023
}
3024
return canFoldAddRegIntoAddrMode(1ULL << Shift);
3025
}
3026
3027
case AArch64::ADDXrr:
3028
// add Xa, Xn, Xm
3029
// ldr Xd, [Xa]
3030
// ->
3031
// ldr Xd, [Xn, Xm, lsl #0]
3032
3033
// Don't fold the add if the result would be slower, unless optimising for
3034
// size.
3035
if (!OptSize && avoidSlowSTRQ(MemI))
3036
return false;
3037
return canFoldAddRegIntoAddrMode(1);
3038
3039
case AArch64::ADDXrx:
3040
// add Xa, Xn, Wm, {s,u}xtw #N
3041
// ldr Xd, [Xa]
3042
// ->
3043
// ldr Xd, [Xn, Wm, {s,u}xtw #N]
3044
3045
// Don't fold the add if the result would be slower, unless optimising for
3046
// size.
3047
if (!OptSize && avoidSlowSTRQ(MemI))
3048
return false;
3049
3050
// Can fold only sign-/zero-extend of a word.
3051
unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3052
AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3053
if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3054
return false;
3055
3056
return canFoldAddRegIntoAddrMode(
3057
1ULL << AArch64_AM::getArithShiftValue(Imm),
3058
(Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3059
: ExtAddrMode::Formula::ZExtScaledReg);
3060
}
3061
}
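// Worked example (not from the original source; registers arbitrary): for
//   add x8, x0, #16
//   ldr x9, [x8, #8]
// folding the ADDXri yields BaseReg = x0, Displacement = 24, and
// emitLdStWithAddr below rewrites the access as a single load from [x0, #24]
// (in the unscaled LDUR form, since 24 fits in a signed 9-bit offset).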
3062
3063
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3064
// return the opcode of an instruction performing the same operation, but using
3065
// the [Reg, Reg] addressing mode.
3066
static unsigned regOffsetOpcode(unsigned Opcode) {
3067
switch (Opcode) {
3068
default:
3069
llvm_unreachable("Address folding not implemented for instruction");
3070
3071
case AArch64::LDURQi:
3072
case AArch64::LDRQui:
3073
return AArch64::LDRQroX;
3074
case AArch64::STURQi:
3075
case AArch64::STRQui:
3076
return AArch64::STRQroX;
3077
case AArch64::LDURDi:
3078
case AArch64::LDRDui:
3079
return AArch64::LDRDroX;
3080
case AArch64::STURDi:
3081
case AArch64::STRDui:
3082
return AArch64::STRDroX;
3083
case AArch64::LDURXi:
3084
case AArch64::LDRXui:
3085
return AArch64::LDRXroX;
3086
case AArch64::STURXi:
3087
case AArch64::STRXui:
3088
return AArch64::STRXroX;
3089
case AArch64::LDURWi:
3090
case AArch64::LDRWui:
3091
return AArch64::LDRWroX;
3092
case AArch64::LDURSWi:
3093
case AArch64::LDRSWui:
3094
return AArch64::LDRSWroX;
3095
case AArch64::STURWi:
3096
case AArch64::STRWui:
3097
return AArch64::STRWroX;
3098
case AArch64::LDURHi:
3099
case AArch64::LDRHui:
3100
return AArch64::LDRHroX;
3101
case AArch64::STURHi:
3102
case AArch64::STRHui:
3103
return AArch64::STRHroX;
3104
case AArch64::LDURHHi:
3105
case AArch64::LDRHHui:
3106
return AArch64::LDRHHroX;
3107
case AArch64::STURHHi:
3108
case AArch64::STRHHui:
3109
return AArch64::STRHHroX;
3110
case AArch64::LDURSHXi:
3111
case AArch64::LDRSHXui:
3112
return AArch64::LDRSHXroX;
3113
case AArch64::LDURSHWi:
3114
case AArch64::LDRSHWui:
3115
return AArch64::LDRSHWroX;
3116
case AArch64::LDURBi:
3117
case AArch64::LDRBui:
3118
return AArch64::LDRBroX;
3119
case AArch64::LDURBBi:
3120
case AArch64::LDRBBui:
3121
return AArch64::LDRBBroX;
3122
case AArch64::LDURSBXi:
3123
case AArch64::LDRSBXui:
3124
return AArch64::LDRSBXroX;
3125
case AArch64::LDURSBWi:
3126
case AArch64::LDRSBWui:
3127
return AArch64::LDRSBWroX;
3128
case AArch64::STURBi:
3129
case AArch64::STRBui:
3130
return AArch64::STRBroX;
3131
case AArch64::STURBBi:
3132
case AArch64::STRBBui:
3133
return AArch64::STRBBroX;
3134
}
3135
}
3136
3137
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3138
// the opcode of an instruction performing the same operation, but using the
3139
// [Reg, #Imm] addressing mode with scaled offset.
3140
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3141
switch (Opcode) {
3142
default:
3143
llvm_unreachable("Address folding not implemented for instruction");
3144
3145
case AArch64::LDURQi:
3146
Scale = 16;
3147
return AArch64::LDRQui;
3148
case AArch64::STURQi:
3149
Scale = 16;
3150
return AArch64::STRQui;
3151
case AArch64::LDURDi:
3152
Scale = 8;
3153
return AArch64::LDRDui;
3154
case AArch64::STURDi:
3155
Scale = 8;
3156
return AArch64::STRDui;
3157
case AArch64::LDURXi:
3158
Scale = 8;
3159
return AArch64::LDRXui;
3160
case AArch64::STURXi:
3161
Scale = 8;
3162
return AArch64::STRXui;
3163
case AArch64::LDURWi:
3164
Scale = 4;
3165
return AArch64::LDRWui;
3166
case AArch64::LDURSWi:
3167
Scale = 4;
3168
return AArch64::LDRSWui;
3169
case AArch64::STURWi:
3170
Scale = 4;
3171
return AArch64::STRWui;
3172
case AArch64::LDURHi:
3173
Scale = 2;
3174
return AArch64::LDRHui;
3175
case AArch64::STURHi:
3176
Scale = 2;
3177
return AArch64::STRHui;
3178
case AArch64::LDURHHi:
3179
Scale = 2;
3180
return AArch64::LDRHHui;
3181
case AArch64::STURHHi:
3182
Scale = 2;
3183
return AArch64::STRHHui;
3184
case AArch64::LDURSHXi:
3185
Scale = 2;
3186
return AArch64::LDRSHXui;
3187
case AArch64::LDURSHWi:
3188
Scale = 2;
3189
return AArch64::LDRSHWui;
3190
case AArch64::LDURBi:
3191
Scale = 1;
3192
return AArch64::LDRBui;
3193
case AArch64::LDURBBi:
3194
Scale = 1;
3195
return AArch64::LDRBBui;
3196
case AArch64::LDURSBXi:
3197
Scale = 1;
3198
return AArch64::LDRSBXui;
3199
case AArch64::LDURSBWi:
3200
Scale = 1;
3201
return AArch64::LDRSBWui;
3202
case AArch64::STURBi:
3203
Scale = 1;
3204
return AArch64::STRBui;
3205
case AArch64::STURBBi:
3206
Scale = 1;
3207
return AArch64::STRBBui;
3208
case AArch64::LDRQui:
3209
case AArch64::STRQui:
3210
Scale = 16;
3211
return Opcode;
3212
case AArch64::LDRDui:
3213
case AArch64::STRDui:
3214
case AArch64::LDRXui:
3215
case AArch64::STRXui:
3216
Scale = 8;
3217
return Opcode;
3218
case AArch64::LDRWui:
3219
case AArch64::LDRSWui:
3220
case AArch64::STRWui:
3221
Scale = 4;
3222
return Opcode;
3223
case AArch64::LDRHui:
3224
case AArch64::STRHui:
3225
case AArch64::LDRHHui:
3226
case AArch64::STRHHui:
3227
case AArch64::LDRSHXui:
3228
case AArch64::LDRSHWui:
3229
Scale = 2;
3230
return Opcode;
3231
case AArch64::LDRBui:
3232
case AArch64::LDRBBui:
3233
case AArch64::LDRSBXui:
3234
case AArch64::LDRSBWui:
3235
case AArch64::STRBui:
3236
case AArch64::STRBBui:
3237
Scale = 1;
3238
return Opcode;
3239
}
3240
}
3241
3242
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3243
// the opcode of an instruction performing the same operation, but using the
3244
// [Reg, #Imm] addressing mode with unscaled offset.
3245
unsigned unscaledOffsetOpcode(unsigned Opcode) {
3246
switch (Opcode) {
3247
default:
3248
llvm_unreachable("Address folding not implemented for instruction");
3249
3250
case AArch64::LDURQi:
3251
case AArch64::STURQi:
3252
case AArch64::LDURDi:
3253
case AArch64::STURDi:
3254
case AArch64::LDURXi:
3255
case AArch64::STURXi:
3256
case AArch64::LDURWi:
3257
case AArch64::LDURSWi:
3258
case AArch64::STURWi:
3259
case AArch64::LDURHi:
3260
case AArch64::STURHi:
3261
case AArch64::LDURHHi:
3262
case AArch64::STURHHi:
3263
case AArch64::LDURSHXi:
3264
case AArch64::LDURSHWi:
3265
case AArch64::LDURBi:
3266
case AArch64::STURBi:
3267
case AArch64::LDURBBi:
3268
case AArch64::STURBBi:
3269
case AArch64::LDURSBWi:
3270
case AArch64::LDURSBXi:
3271
return Opcode;
3272
case AArch64::LDRQui:
3273
return AArch64::LDURQi;
3274
case AArch64::STRQui:
3275
return AArch64::STURQi;
3276
case AArch64::LDRDui:
3277
return AArch64::LDURDi;
3278
case AArch64::STRDui:
3279
return AArch64::STURDi;
3280
case AArch64::LDRXui:
3281
return AArch64::LDURXi;
3282
case AArch64::STRXui:
3283
return AArch64::STURXi;
3284
case AArch64::LDRWui:
3285
return AArch64::LDURWi;
3286
case AArch64::LDRSWui:
3287
return AArch64::LDURSWi;
3288
case AArch64::STRWui:
3289
return AArch64::STURWi;
3290
case AArch64::LDRHui:
3291
return AArch64::LDURHi;
3292
case AArch64::STRHui:
3293
return AArch64::STURHi;
3294
case AArch64::LDRHHui:
3295
return AArch64::LDURHHi;
3296
case AArch64::STRHHui:
3297
return AArch64::STURHHi;
3298
case AArch64::LDRSHXui:
3299
return AArch64::LDURSHXi;
3300
case AArch64::LDRSHWui:
3301
return AArch64::LDURSHWi;
3302
case AArch64::LDRBBui:
3303
return AArch64::LDURBBi;
3304
case AArch64::LDRBui:
3305
return AArch64::LDURBi;
3306
case AArch64::STRBBui:
3307
return AArch64::STURBBi;
3308
case AArch64::STRBui:
3309
return AArch64::STURBi;
3310
case AArch64::LDRSBWui:
3311
return AArch64::LDURSBWi;
3312
case AArch64::LDRSBXui:
3313
return AArch64::LDURSBXi;
3314
}
3315
}
3316
3317
// Given the opcode of a memory load/store instruction, return the opcode of an
3318
// instruction performing the same operation, but using
3319
// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3320
// offset register.
3321
static unsigned offsetExtendOpcode(unsigned Opcode) {
3322
switch (Opcode) {
3323
default:
3324
llvm_unreachable("Address folding not implemented for instruction");
3325
3326
case AArch64::LDRQroX:
3327
case AArch64::LDURQi:
3328
case AArch64::LDRQui:
3329
return AArch64::LDRQroW;
3330
case AArch64::STRQroX:
3331
case AArch64::STURQi:
3332
case AArch64::STRQui:
3333
return AArch64::STRQroW;
3334
case AArch64::LDRDroX:
3335
case AArch64::LDURDi:
3336
case AArch64::LDRDui:
3337
return AArch64::LDRDroW;
3338
case AArch64::STRDroX:
3339
case AArch64::STURDi:
3340
case AArch64::STRDui:
3341
return AArch64::STRDroW;
3342
case AArch64::LDRXroX:
3343
case AArch64::LDURXi:
3344
case AArch64::LDRXui:
3345
return AArch64::LDRXroW;
3346
case AArch64::STRXroX:
3347
case AArch64::STURXi:
3348
case AArch64::STRXui:
3349
return AArch64::STRXroW;
3350
case AArch64::LDRWroX:
3351
case AArch64::LDURWi:
3352
case AArch64::LDRWui:
3353
return AArch64::LDRWroW;
3354
case AArch64::LDRSWroX:
3355
case AArch64::LDURSWi:
3356
case AArch64::LDRSWui:
3357
return AArch64::LDRSWroW;
3358
case AArch64::STRWroX:
3359
case AArch64::STURWi:
3360
case AArch64::STRWui:
3361
return AArch64::STRWroW;
3362
case AArch64::LDRHroX:
3363
case AArch64::LDURHi:
3364
case AArch64::LDRHui:
3365
return AArch64::LDRHroW;
3366
case AArch64::STRHroX:
3367
case AArch64::STURHi:
3368
case AArch64::STRHui:
3369
return AArch64::STRHroW;
3370
case AArch64::LDRHHroX:
3371
case AArch64::LDURHHi:
3372
case AArch64::LDRHHui:
3373
return AArch64::LDRHHroW;
3374
case AArch64::STRHHroX:
3375
case AArch64::STURHHi:
3376
case AArch64::STRHHui:
3377
return AArch64::STRHHroW;
3378
case AArch64::LDRSHXroX:
3379
case AArch64::LDURSHXi:
3380
case AArch64::LDRSHXui:
3381
return AArch64::LDRSHXroW;
3382
case AArch64::LDRSHWroX:
3383
case AArch64::LDURSHWi:
3384
case AArch64::LDRSHWui:
3385
return AArch64::LDRSHWroW;
3386
case AArch64::LDRBroX:
3387
case AArch64::LDURBi:
3388
case AArch64::LDRBui:
3389
return AArch64::LDRBroW;
3390
case AArch64::LDRBBroX:
3391
case AArch64::LDURBBi:
3392
case AArch64::LDRBBui:
3393
return AArch64::LDRBBroW;
3394
case AArch64::LDRSBXroX:
3395
case AArch64::LDURSBXi:
3396
case AArch64::LDRSBXui:
3397
return AArch64::LDRSBXroW;
3398
case AArch64::LDRSBWroX:
3399
case AArch64::LDURSBWi:
3400
case AArch64::LDRSBWui:
3401
return AArch64::LDRSBWroW;
3402
case AArch64::STRBroX:
3403
case AArch64::STURBi:
3404
case AArch64::STRBui:
3405
return AArch64::STRBroW;
3406
case AArch64::STRBBroX:
3407
case AArch64::STURBBi:
3408
case AArch64::STRBBui:
3409
return AArch64::STRBBroW;
3410
}
3411
}

MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
                                                 const ExtAddrMode &AM) const {

  const DebugLoc &DL = MemI.getDebugLoc();
  MachineBasicBlock &MBB = *MemI.getParent();
  MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();

  if (AM.Form == ExtAddrMode::Formula::Basic) {
    if (AM.ScaledReg) {
      // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
      unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
      MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
      auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                   .addReg(MemI.getOperand(0).getReg(),
                           MemI.mayLoad() ? RegState::Define : 0)
                   .addReg(AM.BaseReg)
                   .addReg(AM.ScaledReg)
                   .addImm(0)
                   .addImm(AM.Scale > 1)
                   .setMemRefs(MemI.memoperands())
                   .setMIFlags(MemI.getFlags());
      return B.getInstr();
    }

    assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
           "Addressing mode not supported for folding");

    // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
    unsigned Scale = 1;
    unsigned Opcode = MemI.getOpcode();
    if (isInt<9>(AM.Displacement))
      Opcode = unscaledOffsetOpcode(Opcode);
    else
      Opcode = scaledOffsetOpcode(Opcode, Scale);

    auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                 .addReg(MemI.getOperand(0).getReg(),
                         MemI.mayLoad() ? RegState::Define : 0)
                 .addReg(AM.BaseReg)
                 .addImm(AM.Displacement / Scale)
                 .setMemRefs(MemI.memoperands())
                 .setMIFlags(MemI.getFlags());
    return B.getInstr();
  }

  if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
      AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
    // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
    assert(AM.ScaledReg && !AM.Displacement &&
           "Address offset can be a register or an immediate, but not both");
    unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
    MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
    // Make sure the offset register is in the correct register class.
    Register OffsetReg = AM.ScaledReg;
    const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
    if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
      OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
      BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
          .addReg(AM.ScaledReg, 0, AArch64::sub_32);
    }
    auto B = BuildMI(MBB, MemI, DL, get(Opcode))
                 .addReg(MemI.getOperand(0).getReg(),
                         MemI.mayLoad() ? RegState::Define : 0)
                 .addReg(AM.BaseReg)
                 .addReg(OffsetReg)
                 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
                 .addImm(AM.Scale != 1)
                 .setMemRefs(MemI.memoperands())
                 .setMIFlags(MemI.getFlags());

    return B.getInstr();
  }

  llvm_unreachable(
      "Function must not be called with an addressing mode it can't handle");
}

bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
    const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, TypeSize &Width,
    const TargetRegisterInfo *TRI) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // Handle only loads/stores with base register followed by immediate offset.
  if (LdSt.getNumExplicitOperands() == 3) {
    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
    if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
        !LdSt.getOperand(2).isImm())
      return false;
  } else if (LdSt.getNumExplicitOperands() == 4) {
    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
    if (!LdSt.getOperand(1).isReg() ||
        (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
        !LdSt.getOperand(3).isImm())
      return false;
  } else
    return false;

  // Get the scaling factor for the instruction and set the width for the
  // instruction.
  TypeSize Scale(0U, false);
  int64_t Dummy1, Dummy2;

  // If this returns false, then it's an instruction we don't want to handle.
  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
    return false;

  // Compute the offset. Offset is calculated as the immediate operand
  // multiplied by the scaling factor. Unscaled instructions have scaling factor
  // set to 1.
  if (LdSt.getNumExplicitOperands() == 3) {
    BaseOp = &LdSt.getOperand(1);
    Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
  } else {
    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
    BaseOp = &LdSt.getOperand(2);
    Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
  }
  OffsetIsScalable = Scale.isScalable();

  if (!BaseOp->isReg() && !BaseOp->isFI())
    return false;

  return true;
}

MachineOperand &
AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
  return OfsOp;
}

bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
                                    TypeSize &Width, int64_t &MinOffset,
                                    int64_t &MaxOffset) {
  switch (Opcode) {
  // Not a memory operation or something we want to handle.
  default:
    Scale = TypeSize::getFixed(0);
    Width = TypeSize::getFixed(0);
    MinOffset = MaxOffset = 0;
    return false;
  // LDR / STR
  case AArch64::LDRQui:
  case AArch64::STRQui:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRXui:
  case AArch64::LDRDui:
  case AArch64::STRXui:
  case AArch64::STRDui:
  case AArch64::PRFMui:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRWui:
  case AArch64::LDRSui:
  case AArch64::LDRSWui:
  case AArch64::STRWui:
  case AArch64::STRSui:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:
  case AArch64::STRHui:
  case AArch64::STRHHui:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  // post/pre inc
  case AArch64::STRQpre:
  case AArch64::LDRQpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRXpre:
  case AArch64::STRDpre:
  case AArch64::LDRXpost:
  case AArch64::LDRDpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRWpost:
  case AArch64::LDRWpost:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // Unscaled
  case AArch64::LDURQi:
  case AArch64::STURQi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURXi:
  case AArch64::LDURDi:
  case AArch64::LDAPURXi:
  case AArch64::STURXi:
  case AArch64::STURDi:
  case AArch64::STLURXi:
  case AArch64::PRFUMi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURWi:
  case AArch64::LDURSi:
  case AArch64::LDURSWi:
  case AArch64::LDAPURi:
  case AArch64::LDAPURSWi:
  case AArch64::STURWi:
  case AArch64::STURSi:
  case AArch64::STLURWi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURHi:
  case AArch64::LDURHHi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSHWi:
  case AArch64::LDAPURHi:
  case AArch64::LDAPURSHWi:
  case AArch64::LDAPURSHXi:
  case AArch64::STURHi:
  case AArch64::STURHHi:
  case AArch64::STLURHi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURBi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSBWi:
  case AArch64::LDAPURBi:
  case AArch64::LDAPURSBWi:
  case AArch64::LDAPURSBXi:
  case AArch64::STURBi:
  case AArch64::STURBBi:
  case AArch64::STLURBi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // LDP / STP
  case AArch64::LDPQi:
  case AArch64::LDNPQi:
  case AArch64::STPQi:
  case AArch64::STNPQi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPXi:
  case AArch64::LDPDi:
  case AArch64::LDNPXi:
  case AArch64::LDNPDi:
  case AArch64::STPXi:
  case AArch64::STPDi:
  case AArch64::STNPXi:
  case AArch64::STNPDi:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPWi:
  case AArch64::LDPSi:
  case AArch64::LDNPWi:
  case AArch64::LDNPSi:
  case AArch64::STPWi:
  case AArch64::STPSi:
  case AArch64::STNPWi:
  case AArch64::STNPSi:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(8);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  // pre/post inc
  case AArch64::STPQpre:
  case AArch64::LDPQpost:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -1024;
    MaxOffset = 1008;
    break;
  case AArch64::STPXpre:
  case AArch64::LDPXpost:
  case AArch64::STPDpre:
  case AArch64::LDPDpost:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = -512;
    MaxOffset = 504;
    break;
  case AArch64::StoreSwiftAsyncContext:
    // Store is an STRXui, but there might be an ADDXri in the expansion too.
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::ADDG:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::TAGPstack:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    // TAGP with a negative offset turns into SUBP, which has a maximum offset
    // of 63 (not 64!).
    MinOffset = -63;
    MaxOffset = 63;
    break;
  case AArch64::LDG:
  case AArch64::STGi:
  case AArch64::STZGi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // SVE
  case AArch64::STR_ZZZZXI:
  case AArch64::LDR_ZZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -256;
    MaxOffset = 252;
    break;
  case AArch64::STR_ZZZXI:
  case AArch64::LDR_ZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -256;
    MaxOffset = 253;
    break;
  case AArch64::STR_ZZXI:
  case AArch64::LDR_ZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_PXI:
  case AArch64::STR_PXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDR_PPXI:
  case AArch64::STR_PPXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LD1B_IMM:
  case AArch64::LD1H_IMM:
  case AArch64::LD1W_IMM:
  case AArch64::LD1D_IMM:
  case AArch64::LDNT1B_ZRI:
  case AArch64::LDNT1H_ZRI:
  case AArch64::LDNT1W_ZRI:
  case AArch64::LDNT1D_ZRI:
  case AArch64::ST1B_IMM:
  case AArch64::ST1H_IMM:
  case AArch64::ST1W_IMM:
  case AArch64::ST1D_IMM:
  case AArch64::STNT1B_ZRI:
  case AArch64::STNT1H_ZRI:
  case AArch64::STNT1W_ZRI:
  case AArch64::STNT1D_ZRI:
  case AArch64::LDNF1B_IMM:
  case AArch64::LDNF1H_IMM:
  case AArch64::LDNF1W_IMM:
  case AArch64::LDNF1D_IMM:
    // A full vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD2B_IMM:
  case AArch64::LD2H_IMM:
  case AArch64::LD2W_IMM:
  case AArch64::LD2D_IMM:
  case AArch64::ST2B_IMM:
  case AArch64::ST2H_IMM:
  case AArch64::ST2W_IMM:
  case AArch64::ST2D_IMM:
    Scale = TypeSize::getScalable(32);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD3B_IMM:
  case AArch64::LD3H_IMM:
  case AArch64::LD3W_IMM:
  case AArch64::LD3D_IMM:
  case AArch64::ST3B_IMM:
  case AArch64::ST3H_IMM:
  case AArch64::ST3W_IMM:
  case AArch64::ST3D_IMM:
    Scale = TypeSize::getScalable(48);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD4B_IMM:
  case AArch64::LD4H_IMM:
  case AArch64::LD4W_IMM:
  case AArch64::LD4D_IMM:
  case AArch64::ST4B_IMM:
  case AArch64::ST4H_IMM:
  case AArch64::ST4W_IMM:
  case AArch64::ST4D_IMM:
    Scale = TypeSize::getScalable(64);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_H_IMM:
  case AArch64::LD1SB_H_IMM:
  case AArch64::LD1H_S_IMM:
  case AArch64::LD1SH_S_IMM:
  case AArch64::LD1W_D_IMM:
  case AArch64::LD1SW_D_IMM:
  case AArch64::ST1B_H_IMM:
  case AArch64::ST1H_S_IMM:
  case AArch64::ST1W_D_IMM:
  case AArch64::LDNF1B_H_IMM:
  case AArch64::LDNF1SB_H_IMM:
  case AArch64::LDNF1H_S_IMM:
  case AArch64::LDNF1SH_S_IMM:
  case AArch64::LDNF1W_D_IMM:
  case AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(8);
    Width = TypeSize::getScalable(8);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(4);
    Width = TypeSize::getScalable(4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // An eighth vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}
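
// Editorial illustration (not part of the upstream source): the limits above
// are expressed in terms of the unscaled immediate encoded in the instruction.
// For example LDRXui has Scale = 8 and MinOffset/MaxOffset = 0/4095, so it can
// address byte offsets 0, 8, ..., 32760, while its unscaled twin LDURXi has
// Scale = 1 and covers byte offsets -256..255 at any alignment.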

// Scaling factor for unscaled load or store.
int AArch64InstrInfo::getMemScale(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRSWpre:
  case AArch64::LDRWpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::LDPQi:
  case AArch64::LDRQpre:
  case AArch64::STPQi:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
    return 16;
  }
}

bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
  return isPreLd(MI) || isPreSt(MI);
}

bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}

const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
                                                                            : 1;
  return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
                                                                            : 2;
  return MI.getOperand(Idx);
}

static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                              Register Reg) {
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}

bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
  auto IsHFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR16RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR16_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsHFPR);
}

bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
  auto IsQFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR128RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsQFPR);
}

bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::BRK:
  case AArch64::HLT:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
    // Implicit BTI behavior.
    return true;
  case AArch64::PAUTH_PROLOGUE:
    // PAUTH_PROLOGUE expands to PACI(A|B)SP.
    return true;
  case AArch64::HINT: {
    unsigned Imm = MI.getOperand(0).getImm();
    // Explicit BTI instruction.
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return true;
    // PACI(A|B)SP instructions.
    if (Imm == 25 || Imm == 27)
      return true;
    return false;
  }
  default:
    return false;
  }
}

bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
  if (Reg == 0)
    return false;
  assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
  return AArch64::FPR128RegClass.contains(Reg) ||
         AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR32RegClass.contains(Reg) ||
         AArch64::FPR16RegClass.contains(Reg) ||
         AArch64::FPR8RegClass.contains(Reg);
}

bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
  auto IsFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return isFpOrNEON(Reg);

    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass ||
           TRC == &AArch64::FPR64RegClass ||
           TRC == &AArch64::FPR64_loRegClass ||
           TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR8RegClass;
  };
  return llvm::any_of(MI.operands(), IsFPR);
}

// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}
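
// Editorial illustration (not part of the upstream source): for an unscaled
// access such as LDURXi the memory scale is 8, so a byte offset of 16 is
// rewritten to the element offset 2, while a byte offset of 12 is rejected
// because it is not a multiple of the scale and so cannot be encoded by the
// scaled pair form.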

static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
  }
  // These instructions can't be paired based on their opcodes.
  return false;
}

static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
    // Convert to scaled object offsets.
    int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;
    int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;
    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}
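
// Editorial illustration (not part of the upstream source): two LDRXui loads
// from fixed stack objects at object offsets 16 and 24, each with an
// instruction offset of 0, scale to 16/8 + 0 = 2 and 24/8 + 0 = 3. The scaled
// offsets are consecutive, so the accesses are candidates for forming an ldp.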

/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  return Offset1 + 1 == Offset2;
}

static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
                                            unsigned Reg, unsigned SubIdx,
                                            unsigned State,
                                            const TargetRegisterInfo *TRI) {
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  if (Register::isPhysicalRegister(Reg))
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
  return MIB.addReg(Reg, State, SubIdx);
}

static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // We really want the positive remainder mod 32 here, that happens to be
  // easily obtainable with a mask.
  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
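
// Editorial illustration (not part of the upstream source): when copying the
// D-register triple D1_D2_D3 into D2_D3_D4, (2 - 1) & 0x1f = 1 < 3, so a
// front-to-back sub-register copy would overwrite D2 and D3 before they are
// read; copyPhysRegTuple below detects this and copies the tuple in reverse.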

void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}

void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       DebugLoc DL, unsigned DestReg,
                                       unsigned SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
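
// Editorial illustration (not part of the upstream source): with Opcode set to
// ORRXrs and ZeroReg to XZR, each iteration above emits the move
// "orr xD, xzr, xS, lsl #0" for one 64-bit half of a sequential register pair.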

void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, MCRegister DestReg,
                                   MCRegister SrcReg, bool KillSrc) const {
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
    const TargetRegisterInfo *TRI = &getRegisterInfo();

    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMove()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      }
    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      if (Subtarget.hasZeroCycleRegMove()) {
        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
            .addReg(AArch64::XZR)
            .addReg(SrcRegX, RegState::Undef)
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        // Otherwise, expand to ORR WZR.
        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
            .addReg(AArch64::WZR)
            .addReg(SrcReg, getKillRegState(KillSrc));
      }
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    if (Subtarget.isSVEorStreamingSVEAvailable() &&
        !Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    else if (Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    else {
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    DestReg =
        RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
    SrcReg =
        RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    DestReg =
        RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
    SrcReg =
        RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  const TargetRegisterInfo &TRI = getRegisterInfo();
  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
         << TRI.getRegAsmName(SrcReg) << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}

static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertBefore,
                                    const MCInstrDesc &MCID,
                                    Register SrcReg, bool IsKill,
                                    unsigned SubIdx0, unsigned SubIdx1, int FI,
                                    MachineMemOperand *MMO) {
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  if (SrcReg.isPhysical()) {
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}

void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
                                           const TargetRegisterInfo *TRI,
                                           Register VReg) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
  unsigned Opc = 0;
  bool Offset = true;
  MCRegister PNRReg = MCRegister::NoRegister;
  unsigned StackID = TargetStackID::Default;
  switch (TRI->getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRBui;
    break;
  case 2: {
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRHui;
    else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
             AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_PXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRWui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
      else
        assert(SrcReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STR_PPXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRXui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      else
        assert(SrcReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPWi), SrcReg, isKill,
                              AArch64::sube32, AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPXi), SrcReg, isKill,
                              AArch64::sube64, AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(SrcReg, getKillRegState(isKill))
                                     .addFrameIndex(FI);

  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}
4974
4975
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4976
MachineBasicBlock &MBB,
4977
MachineBasicBlock::iterator InsertBefore,
4978
const MCInstrDesc &MCID,
4979
Register DestReg, unsigned SubIdx0,
4980
unsigned SubIdx1, int FI,
4981
MachineMemOperand *MMO) {
4982
Register DestReg0 = DestReg;
4983
Register DestReg1 = DestReg;
4984
bool IsUndef = true;
4985
if (DestReg.isPhysical()) {
4986
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4987
SubIdx0 = 0;
4988
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4989
SubIdx1 = 0;
4990
IsUndef = false;
4991
}
4992
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4993
.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4994
.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4995
.addFrameIndex(FI)
4996
.addImm(0)
4997
.addMemOperand(MMO);
4998
}
4999
5000
void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
5001
MachineBasicBlock::iterator MBBI,
5002
Register DestReg, int FI,
5003
const TargetRegisterClass *RC,
5004
const TargetRegisterInfo *TRI,
5005
Register VReg) const {
5006
MachineFunction &MF = *MBB.getParent();
5007
MachineFrameInfo &MFI = MF.getFrameInfo();
5008
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5009
MachineMemOperand *MMO =
5010
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5011
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5012
5013
unsigned Opc = 0;
5014
bool Offset = true;
5015
unsigned StackID = TargetStackID::Default;
5016
Register PNRReg = MCRegister::NoRegister;
5017
switch (TRI->getSpillSize(*RC)) {
5018
case 1:
5019
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5020
Opc = AArch64::LDRBui;
5021
break;
5022
case 2: {
5023
bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5024
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5025
Opc = AArch64::LDRHui;
5026
else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5027
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5028
"Unexpected register load without SVE load instructions");
5029
if (IsPNR)
5030
PNRReg = DestReg;
5031
Opc = AArch64::LDR_PXI;
5032
StackID = TargetStackID::ScalableVector;
5033
}
5034
break;
5035
}
5036
case 4:
5037
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5038
Opc = AArch64::LDRWui;
5039
if (DestReg.isVirtual())
5040
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5041
else
5042
assert(DestReg != AArch64::WSP);
5043
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5044
Opc = AArch64::LDRSui;
5045
else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5046
Opc = AArch64::LDR_PPXI;
5047
StackID = TargetStackID::ScalableVector;
5048
}
5049
break;
5050
case 8:
5051
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5052
Opc = AArch64::LDRXui;
5053
if (DestReg.isVirtual())
5054
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5055
else
5056
assert(DestReg != AArch64::SP);
5057
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5058
Opc = AArch64::LDRDui;
5059
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5060
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5061
get(AArch64::LDPWi), DestReg, AArch64::sube32,
5062
AArch64::subo32, FI, MMO);
5063
return;
5064
}
5065
break;
5066
case 16:
5067
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5068
Opc = AArch64::LDRQui;
5069
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5070
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071
Opc = AArch64::LD1Twov1d;
5072
Offset = false;
5073
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5074
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5075
get(AArch64::LDPXi), DestReg, AArch64::sube64,
5076
AArch64::subo64, FI, MMO);
5077
return;
5078
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5079
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5080
"Unexpected register load without SVE load instructions");
5081
Opc = AArch64::LDR_ZXI;
5082
StackID = TargetStackID::ScalableVector;
5083
}
5084
break;
5085
case 24:
5086
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5087
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5088
Opc = AArch64::LD1Threev1d;
5089
Offset = false;
5090
}
5091
break;
5092
case 32:
5093
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5094
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5095
Opc = AArch64::LD1Fourv1d;
5096
Offset = false;
5097
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5098
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5099
Opc = AArch64::LD1Twov2d;
5100
Offset = false;
5101
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5102
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5103
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104
"Unexpected register load without SVE load instructions");
5105
Opc = AArch64::LDR_ZZXI;
5106
StackID = TargetStackID::ScalableVector;
5107
}
5108
break;
5109
case 48:
5110
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5111
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5112
Opc = AArch64::LD1Threev2d;
5113
Offset = false;
5114
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5115
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5116
"Unexpected register load without SVE load instructions");
5117
Opc = AArch64::LDR_ZZZXI;
5118
StackID = TargetStackID::ScalableVector;
5119
}
5120
break;
5121
case 64:
5122
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5123
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5124
Opc = AArch64::LD1Fourv2d;
5125
Offset = false;
5126
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5127
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5128
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5129
"Unexpected register load without SVE load instructions");
5130
Opc = AArch64::LDR_ZZZZXI;
5131
StackID = TargetStackID::ScalableVector;
5132
}
5133
break;
5134
}
5135
5136
assert(Opc && "Unknown register class");
5137
MFI.setStackID(FI, StackID);
5138
5139
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5140
.addReg(DestReg, getDefRegState(true))
5141
.addFrameIndex(FI);
5142
if (Offset)
5143
MI.addImm(0);
5144
if (PNRReg.isValid() && !PNRReg.isVirtual())
5145
MI.addDef(PNRReg, RegState::Implicit);
5146
MI.addMemOperand(MMO);
5147
}
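
// Illustrative note (not part of the upstream source): for a 16-byte spill
// slot the switch above picks LDRQui for FPR128 and LDR_ZXI (with a
// ScalableVector stack ID) for ZPR, while a 32-byte QQ reload uses
// LD1Twov2d, for which Offset is false and no immediate 0 is appended.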
5148
5149
bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5150
const MachineInstr &UseMI,
5151
const TargetRegisterInfo *TRI) {
5152
return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5153
UseMI.getIterator()),
5154
[TRI](const MachineInstr &I) {
5155
return I.modifiesRegister(AArch64::NZCV, TRI) ||
5156
I.readsRegister(AArch64::NZCV, TRI);
5157
});
5158
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the
  // number of 64-bit granules as opposed to 128-bit vector chunks,
  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}
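
// Illustrative example (not part of the upstream source): a StackOffset of
// 16 fixed bytes plus 34 scalable bytes decomposes into ByteSized = 16 and
// VGSized = 17, i.e. a DWARF offset of 16 + 17 * VG bytes, where VG is the
// runtime number of 64-bit granules in an SVE vector.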

/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}
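
// Illustrative example (not part of the upstream source): 130 scalable bytes
// give NumPredicateVectors = 65, which is larger than 62, so the offset is
// refolded into NumDataVectors = 8 (one ADDVL #8) with NumPredicateVectors = 1
// (one ADDPL #1) left over.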

// Convenience function to create a DWARF expression for
// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}
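
// Illustrative example (not part of the upstream source): NumBytes = 16 and
// NumVGScaledBytes = 8 append
//   DW_OP_consts 16, DW_OP_plus, DW_OP_consts 8, DW_OP_bregx VG 0,
//   DW_OP_mul, DW_OP_plus
// to Expr and " + 16 + 8 * VG" to the comment stream.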
5230
5231
// Creates an MCCFIInstruction:
5232
// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5233
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5234
unsigned Reg,
5235
const StackOffset &Offset) {
5236
int64_t NumBytes, NumVGScaledBytes;
5237
AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5238
NumVGScaledBytes);
5239
std::string CommentBuffer;
5240
llvm::raw_string_ostream Comment(CommentBuffer);
5241
5242
if (Reg == AArch64::SP)
5243
Comment << "sp";
5244
else if (Reg == AArch64::FP)
5245
Comment << "fp";
5246
else
5247
Comment << printReg(Reg, &TRI);
5248
5249
// Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5250
SmallString<64> Expr;
5251
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5252
Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5253
Expr.push_back(0);
5254
appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5255
TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5256
5257
// Wrap this into DW_CFA_def_cfa.
5258
SmallString<64> DefCfaExpr;
5259
DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5260
uint8_t buffer[16];
5261
DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5262
DefCfaExpr.append(Expr.str());
5263
return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5264
Comment.str());
5265
}
5266
5267
MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5268
unsigned FrameReg, unsigned Reg,
5269
const StackOffset &Offset,
5270
bool LastAdjustmentWasScalable) {
5271
if (Offset.getScalable())
5272
return createDefCFAExpression(TRI, Reg, Offset);
5273
5274
if (FrameReg == Reg && !LastAdjustmentWasScalable)
5275
return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5276
5277
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5278
return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5279
}
5280
5281
MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5282
unsigned Reg,
5283
const StackOffset &OffsetFromDefCFA) {
5284
int64_t NumBytes, NumVGScaledBytes;
5285
AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5286
OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5287
5288
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5289
5290
// Non-scalable offsets can use DW_CFA_offset directly.
5291
if (!NumVGScaledBytes)
5292
return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5293
5294
std::string CommentBuffer;
5295
llvm::raw_string_ostream Comment(CommentBuffer);
5296
Comment << printReg(Reg, &TRI) << " @ cfa";
5297
5298
// Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5299
SmallString<64> OffsetExpr;
5300
appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5301
TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5302
5303
// Wrap this into DW_CFA_expression
5304
SmallString<64> CfaExpr;
5305
CfaExpr.push_back(dwarf::DW_CFA_expression);
5306
uint8_t buffer[16];
5307
CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5308
CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5309
CfaExpr.append(OffsetExpr.str());
5310
5311
return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5312
Comment.str());
5313
}
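
// The escape built above encodes a DW_CFA_expression record:
//   DW_CFA_expression, ULEB128(DwarfReg), ULEB128(sizeof OffsetExpr), OffsetExpr
// where OffsetExpr is the "NumBytes + NumVGScaledBytes * VG" sequence produced
// by appendVGScaledOffsetExpr.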
5314
5315
// Helper function to emit a frame offset adjustment from a given
5316
// pointer (SrcReg), stored into DestReg. This function is explicit
5317
// in that it requires the opcode.
5318
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5319
MachineBasicBlock::iterator MBBI,
5320
const DebugLoc &DL, unsigned DestReg,
5321
unsigned SrcReg, int64_t Offset, unsigned Opc,
5322
const TargetInstrInfo *TII,
5323
MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5324
bool *HasWinCFI, bool EmitCFAOffset,
5325
StackOffset CFAOffset, unsigned FrameReg) {
5326
int Sign = 1;
5327
unsigned MaxEncoding, ShiftSize;
5328
switch (Opc) {
5329
case AArch64::ADDXri:
5330
case AArch64::ADDSXri:
5331
case AArch64::SUBXri:
5332
case AArch64::SUBSXri:
5333
MaxEncoding = 0xfff;
5334
ShiftSize = 12;
5335
break;
5336
case AArch64::ADDVL_XXI:
5337
case AArch64::ADDPL_XXI:
5338
case AArch64::ADDSVL_XXI:
5339
case AArch64::ADDSPL_XXI:
5340
MaxEncoding = 31;
5341
ShiftSize = 0;
5342
if (Offset < 0) {
5343
MaxEncoding = 32;
5344
Sign = -1;
5345
Offset = -Offset;
5346
}
5347
break;
5348
default:
5349
llvm_unreachable("Unsupported opcode");
5350
}
5351
5352
// `Offset` can be in bytes or in "scalable bytes".
5353
int VScale = 1;
5354
if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5355
VScale = 16;
5356
else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5357
VScale = 2;
5358
5359
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
5360
// scratch register. If DestReg is a virtual register, use it as the
5361
// scratch register; otherwise, create a new virtual register (to be
5362
// replaced by the scavenger at the end of PEI). That case can be optimized
5363
// slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5364
// register can be loaded with offset%8 and the add/sub can use an extending
5365
// instruction with LSL#3.
5366
// Currently the function handles any offsets but generates a poor sequence
5367
// of code.
5368
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5369
5370
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5371
Register TmpReg = DestReg;
5372
if (TmpReg == AArch64::XZR)
5373
TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5374
&AArch64::GPR64RegClass);
5375
do {
5376
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5377
unsigned LocalShiftSize = 0;
5378
if (ThisVal > MaxEncoding) {
5379
ThisVal = ThisVal >> ShiftSize;
5380
LocalShiftSize = ShiftSize;
5381
}
5382
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5383
"Encoding cannot handle value that big");
5384
5385
Offset -= ThisVal << LocalShiftSize;
5386
if (Offset == 0)
5387
TmpReg = DestReg;
5388
auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5389
.addReg(SrcReg)
5390
.addImm(Sign * (int)ThisVal);
5391
if (ShiftSize)
5392
MBI = MBI.addImm(
5393
AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5394
MBI = MBI.setMIFlag(Flag);
5395
5396
auto Change =
5397
VScale == 1
5398
? StackOffset::getFixed(ThisVal << LocalShiftSize)
5399
: StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5400
if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5401
CFAOffset += Change;
5402
else
5403
CFAOffset -= Change;
5404
if (EmitCFAOffset && DestReg == TmpReg) {
5405
MachineFunction &MF = *MBB.getParent();
5406
const TargetSubtargetInfo &STI = MF.getSubtarget();
5407
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5408
5409
unsigned CFIIndex = MF.addFrameInst(
5410
createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5411
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5412
.addCFIIndex(CFIIndex)
5413
.setMIFlags(Flag);
5414
}
5415
5416
if (NeedsWinCFI) {
5417
assert(Sign == 1 && "SEH directives should always have a positive sign");
5418
int Imm = (int)(ThisVal << LocalShiftSize);
5419
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5420
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5421
if (HasWinCFI)
5422
*HasWinCFI = true;
5423
if (Imm == 0)
5424
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5425
else
5426
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5427
.addImm(Imm)
5428
.setMIFlag(Flag);
5429
assert(Offset == 0 && "Expected remaining offset to be zero to "
5430
"emit a single SEH directive");
5431
} else if (DestReg == AArch64::SP) {
5432
if (HasWinCFI)
5433
*HasWinCFI = true;
5434
assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5435
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5436
.addImm(Imm)
5437
.setMIFlag(Flag);
5438
}
5439
}
5440
5441
SrcReg = TmpReg;
5442
} while (Offset);
5443
}
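
// Worked example (illustrative, not part of the upstream source): with
// Opc = ADDXri and Offset = 0x1010, the loop above emits two instructions,
//   add xd, xn, #1, lsl #12
//   add xd, xd, #16
// because 0x1010 exceeds the 12-bit unshifted immediate range.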
5444
5445
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5446
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5447
unsigned DestReg, unsigned SrcReg,
5448
StackOffset Offset, const TargetInstrInfo *TII,
5449
MachineInstr::MIFlag Flag, bool SetNZCV,
5450
bool NeedsWinCFI, bool *HasWinCFI,
5451
bool EmitCFAOffset, StackOffset CFAOffset,
5452
unsigned FrameReg) {
5453
  // If a function is marked as arm_locally_streaming, then the runtime value of
  // vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5460
5461
int64_t Bytes, NumPredicateVectors, NumDataVectors;
5462
AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5463
Offset, Bytes, NumPredicateVectors, NumDataVectors);
5464
5465
// First emit non-scalable frame offsets, or a simple 'mov'.
5466
if (Bytes || (!Offset && SrcReg != DestReg)) {
5467
assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5468
"SP increment/decrement not 8-byte aligned");
5469
unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5470
if (Bytes < 0) {
5471
Bytes = -Bytes;
5472
Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5473
}
5474
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5475
NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5476
FrameReg);
5477
CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5478
? StackOffset::getFixed(-Bytes)
5479
: StackOffset::getFixed(Bytes);
5480
SrcReg = DestReg;
5481
FrameReg = DestReg;
5482
}
5483
5484
assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5485
"SetNZCV not supported with SVE vectors");
5486
assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5487
"WinCFI not supported with SVE vectors");
5488
5489
if (NumDataVectors) {
5490
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5491
UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5492
TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5493
CFAOffset, FrameReg);
5494
CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5495
SrcReg = DestReg;
5496
}
5497
5498
if (NumPredicateVectors) {
5499
assert(DestReg != AArch64::SP && "Unaligned access to SP");
5500
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5501
UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5502
TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5503
CFAOffset, FrameReg);
5504
}
5505
}
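
// Minimal usage sketch (hypothetical call site, assuming the default trailing
// parameters declared in AArch64InstrInfo.h):
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::get(-64, -32), TII, MachineInstr::FrameSetup);
// This decomposes into a SUB of 64 bytes followed by an ADDVL of -2, since
// -32 scalable bytes are -16 predicate increments, i.e. -2 data vectors.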
5506
5507
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5508
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5509
MachineBasicBlock::iterator InsertPt, int FrameIndex,
5510
LiveIntervals *LIS, VirtRegMap *VRM) const {
5511
// This is a bit of a hack. Consider this instruction:
5512
//
5513
// %0 = COPY %sp; GPR64all:%0
5514
//
5515
// We explicitly chose GPR64all for the virtual register so such a copy might
5516
// be eliminated by RegisterCoalescer. However, that may not be possible, and
5517
// %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5518
// register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5519
//
5520
// To prevent that, we are going to constrain the %0 register class here.
5521
if (MI.isFullCopy()) {
5522
Register DstReg = MI.getOperand(0).getReg();
5523
Register SrcReg = MI.getOperand(1).getReg();
5524
if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5525
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5526
return nullptr;
5527
}
5528
if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5529
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5530
return nullptr;
5531
}
5532
  // Nothing can be folded with copy from/to NZCV.
5533
if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5534
return nullptr;
5535
}
5536
5537
// Handle the case where a copy is being spilled or filled but the source
5538
// and destination register class don't match. For example:
5539
//
5540
// %0 = COPY %xzr; GPR64common:%0
5541
//
5542
// In this case we can still safely fold away the COPY and generate the
5543
// following spill code:
5544
//
5545
// STRXui %xzr, %stack.0
5546
//
5547
// This also eliminates spilled cross register class COPYs (e.g. between x and
5548
// d regs) of the same size. For example:
5549
//
5550
// %0 = COPY %1; GPR64:%0, FPR64:%1
5551
//
5552
// will be filled as
5553
//
5554
// LDRDui %0, fi<#0>
5555
//
5556
// instead of
5557
//
5558
// LDRXui %Temp, fi<#0>
5559
// %0 = FMOV %Temp
5560
//
5561
if (MI.isCopy() && Ops.size() == 1 &&
5562
// Make sure we're only folding the explicit COPY defs/uses.
5563
(Ops[0] == 0 || Ops[0] == 1)) {
5564
bool IsSpill = Ops[0] == 0;
5565
bool IsFill = !IsSpill;
5566
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5567
const MachineRegisterInfo &MRI = MF.getRegInfo();
5568
MachineBasicBlock &MBB = *MI.getParent();
5569
const MachineOperand &DstMO = MI.getOperand(0);
5570
const MachineOperand &SrcMO = MI.getOperand(1);
5571
Register DstReg = DstMO.getReg();
5572
Register SrcReg = SrcMO.getReg();
5573
// This is slightly expensive to compute for physical regs since
5574
// getMinimalPhysRegClass is slow.
5575
auto getRegClass = [&](unsigned Reg) {
5576
return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5577
: TRI.getMinimalPhysRegClass(Reg);
5578
};
5579
5580
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5581
assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5582
TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5583
"Mismatched register size in non subreg COPY");
5584
if (IsSpill)
5585
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5586
getRegClass(SrcReg), &TRI, Register());
5587
else
5588
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5589
getRegClass(DstReg), &TRI, Register());
5590
return &*--InsertPt;
5591
}
5592
5593
// Handle cases like spilling def of:
5594
//
5595
// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5596
//
5597
// where the physical register source can be widened and stored to the full
5598
// virtual reg destination stack slot, in this case producing:
5599
//
5600
// STRXui %xzr, %stack.0
5601
//
5602
if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5603
TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5604
assert(SrcMO.getSubReg() == 0 &&
5605
"Unexpected subreg on physical register");
5606
storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5607
FrameIndex, &AArch64::GPR64RegClass, &TRI,
5608
Register());
5609
return &*--InsertPt;
5610
}
5611
5612
// Handle cases like filling use of:
5613
//
5614
// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5615
//
5616
// where we can load the full virtual reg source stack slot, into the subreg
5617
// destination, in this case producing:
5618
//
5619
// LDRWui %0:sub_32<def,read-undef>, %stack.0
5620
//
5621
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5622
const TargetRegisterClass *FillRC;
5623
switch (DstMO.getSubReg()) {
5624
default:
5625
FillRC = nullptr;
5626
break;
5627
case AArch64::sub_32:
5628
FillRC = &AArch64::GPR32RegClass;
5629
break;
5630
case AArch64::ssub:
5631
FillRC = &AArch64::FPR32RegClass;
5632
break;
5633
case AArch64::dsub:
5634
FillRC = &AArch64::FPR64RegClass;
5635
break;
5636
}
5637
5638
if (FillRC) {
5639
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5640
TRI.getRegSizeInBits(*FillRC) &&
5641
"Mismatched regclass size on folded subreg COPY");
5642
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5643
Register());
5644
MachineInstr &LoadMI = *--InsertPt;
5645
MachineOperand &LoadDst = LoadMI.getOperand(0);
5646
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5647
LoadDst.setSubReg(DstMO.getSubReg());
5648
LoadDst.setIsUndef();
5649
return &LoadMI;
5650
}
5651
}
5652
}
5653
5654
// Cannot fold.
5655
return nullptr;
5656
}
5657
5658
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5659
StackOffset &SOffset,
5660
bool *OutUseUnscaledOp,
5661
unsigned *OutUnscaledOp,
5662
int64_t *EmittableOffset) {
5663
// Set output values in case of early exit.
5664
if (EmittableOffset)
5665
*EmittableOffset = 0;
5666
if (OutUseUnscaledOp)
5667
*OutUseUnscaledOp = false;
5668
if (OutUnscaledOp)
5669
*OutUnscaledOp = 0;
5670
5671
// Exit early for structured vector spills/fills as they can't take an
5672
// immediate offset.
5673
switch (MI.getOpcode()) {
5674
default:
5675
break;
5676
case AArch64::LD1Rv1d:
5677
case AArch64::LD1Rv2s:
5678
case AArch64::LD1Rv2d:
5679
case AArch64::LD1Rv4h:
5680
case AArch64::LD1Rv4s:
5681
case AArch64::LD1Rv8b:
5682
case AArch64::LD1Rv8h:
5683
case AArch64::LD1Rv16b:
5684
case AArch64::LD1Twov2d:
5685
case AArch64::LD1Threev2d:
5686
case AArch64::LD1Fourv2d:
5687
case AArch64::LD1Twov1d:
5688
case AArch64::LD1Threev1d:
5689
case AArch64::LD1Fourv1d:
5690
case AArch64::ST1Twov2d:
5691
case AArch64::ST1Threev2d:
5692
case AArch64::ST1Fourv2d:
5693
case AArch64::ST1Twov1d:
5694
case AArch64::ST1Threev1d:
5695
case AArch64::ST1Fourv1d:
5696
case AArch64::ST1i8:
5697
case AArch64::ST1i16:
5698
case AArch64::ST1i32:
5699
case AArch64::ST1i64:
5700
case AArch64::IRG:
5701
case AArch64::IRGstack:
5702
case AArch64::STGloop:
5703
case AArch64::STZGloop:
5704
return AArch64FrameOffsetCannotUpdate;
5705
}
5706
5707
// Get the min/max offset and the scale.
5708
TypeSize ScaleValue(0U, false), Width(0U, false);
5709
int64_t MinOff, MaxOff;
5710
if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5711
MaxOff))
5712
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5713
5714
// Construct the complete offset.
5715
bool IsMulVL = ScaleValue.isScalable();
5716
unsigned Scale = ScaleValue.getKnownMinValue();
5717
int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5718
5719
const MachineOperand &ImmOpnd =
5720
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5721
Offset += ImmOpnd.getImm() * Scale;
5722
5723
// If the offset doesn't match the scale, we rewrite the instruction to
5724
// use the unscaled instruction instead. Likewise, if we have a negative
5725
// offset and there is an unscaled op to use.
5726
std::optional<unsigned> UnscaledOp =
5727
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5728
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5729
if (useUnscaledOp &&
5730
!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5731
MaxOff))
5732
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5733
5734
Scale = ScaleValue.getKnownMinValue();
5735
assert(IsMulVL == ScaleValue.isScalable() &&
5736
"Unscaled opcode has different value for scalable");
5737
5738
int64_t Remainder = Offset % Scale;
5739
assert(!(Remainder && useUnscaledOp) &&
5740
"Cannot have remainder when using unscaled op");
5741
5742
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5743
int64_t NewOffset = Offset / Scale;
5744
if (MinOff <= NewOffset && NewOffset <= MaxOff)
5745
Offset = Remainder;
5746
else {
5747
NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5748
Offset = Offset - (NewOffset * Scale);
5749
}
5750
5751
if (EmittableOffset)
5752
*EmittableOffset = NewOffset;
5753
if (OutUseUnscaledOp)
5754
*OutUseUnscaledOp = useUnscaledOp;
5755
if (OutUnscaledOp && UnscaledOp)
5756
*OutUnscaledOp = *UnscaledOp;
5757
5758
if (IsMulVL)
5759
SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5760
else
5761
SOffset = StackOffset::get(Offset, SOffset.getScalable());
5762
return AArch64FrameOffsetCanUpdate |
5763
(SOffset ? 0 : AArch64FrameOffsetIsLegal);
5764
}
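
// Worked example (illustrative, not part of the upstream source): for an
// LDRXui with immediate 5 (scale 8) and an incoming fixed SOffset of 4, the
// total offset is 44 bytes. 44 is not a multiple of 8, so the unscaled LDURXi
// form is selected; 44 fits its [-256, 255] range, so EmittableOffset becomes
// 44, SOffset becomes zero, and the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.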
5765
5766
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5767
unsigned FrameReg, StackOffset &Offset,
5768
const AArch64InstrInfo *TII) {
5769
unsigned Opcode = MI.getOpcode();
5770
unsigned ImmIdx = FrameRegIdx + 1;
5771
5772
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5773
Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5774
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5775
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5776
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5777
MI.eraseFromParent();
5778
Offset = StackOffset();
5779
return true;
5780
}
5781
5782
int64_t NewOffset;
5783
unsigned UnscaledOp;
5784
bool UseUnscaledOp;
5785
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5786
&UnscaledOp, &NewOffset);
5787
if (Status & AArch64FrameOffsetCanUpdate) {
5788
if (Status & AArch64FrameOffsetIsLegal)
5789
// Replace the FrameIndex with FrameReg.
5790
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5791
if (UseUnscaledOp)
5792
MI.setDesc(TII->get(UnscaledOp));
5793
5794
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5795
return !Offset;
5796
}
5797
5798
return false;
5799
}
5800
5801
void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5802
MachineBasicBlock::iterator MI) const {
5803
DebugLoc DL;
5804
BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5805
}
5806
5807
MCInst AArch64InstrInfo::getNop() const {
5808
return MCInstBuilder(AArch64::HINT).addImm(0);
5809
}
5810
5811
// AArch64 supports MachineCombiner.
5812
bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5813
5814
// True when Opc sets flag
5815
static bool isCombineInstrSettingFlag(unsigned Opc) {
5816
switch (Opc) {
5817
case AArch64::ADDSWrr:
5818
case AArch64::ADDSWri:
5819
case AArch64::ADDSXrr:
5820
case AArch64::ADDSXri:
5821
case AArch64::SUBSWrr:
5822
case AArch64::SUBSXrr:
5823
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5824
case AArch64::SUBSWri:
5825
case AArch64::SUBSXri:
5826
return true;
5827
default:
5828
break;
5829
}
5830
return false;
5831
}
5832
5833
// 32b Opcodes that can be combined with a MUL
5834
static bool isCombineInstrCandidate32(unsigned Opc) {
5835
switch (Opc) {
5836
case AArch64::ADDWrr:
5837
case AArch64::ADDWri:
5838
case AArch64::SUBWrr:
5839
case AArch64::ADDSWrr:
5840
case AArch64::ADDSWri:
5841
case AArch64::SUBSWrr:
5842
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5843
case AArch64::SUBWri:
5844
case AArch64::SUBSWri:
5845
return true;
5846
default:
5847
break;
5848
}
5849
return false;
5850
}
5851
5852
// 64b Opcodes that can be combined with a MUL
5853
static bool isCombineInstrCandidate64(unsigned Opc) {
5854
switch (Opc) {
5855
case AArch64::ADDXrr:
5856
case AArch64::ADDXri:
5857
case AArch64::SUBXrr:
5858
case AArch64::ADDSXrr:
5859
case AArch64::ADDSXri:
5860
case AArch64::SUBSXrr:
5861
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5862
case AArch64::SUBXri:
5863
case AArch64::SUBSXri:
5864
case AArch64::ADDv8i8:
5865
case AArch64::ADDv16i8:
5866
case AArch64::ADDv4i16:
5867
case AArch64::ADDv8i16:
5868
case AArch64::ADDv2i32:
5869
case AArch64::ADDv4i32:
5870
case AArch64::SUBv8i8:
5871
case AArch64::SUBv16i8:
5872
case AArch64::SUBv4i16:
5873
case AArch64::SUBv8i16:
5874
case AArch64::SUBv2i32:
5875
case AArch64::SUBv4i32:
5876
return true;
5877
default:
5878
break;
5879
}
5880
return false;
5881
}
5882
5883
// FP Opcodes that can be combined with a FMUL.
5884
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5885
switch (Inst.getOpcode()) {
5886
default:
5887
break;
5888
case AArch64::FADDHrr:
5889
case AArch64::FADDSrr:
5890
case AArch64::FADDDrr:
5891
case AArch64::FADDv4f16:
5892
case AArch64::FADDv8f16:
5893
case AArch64::FADDv2f32:
5894
case AArch64::FADDv2f64:
5895
case AArch64::FADDv4f32:
5896
case AArch64::FSUBHrr:
5897
case AArch64::FSUBSrr:
5898
case AArch64::FSUBDrr:
5899
case AArch64::FSUBv4f16:
5900
case AArch64::FSUBv8f16:
5901
case AArch64::FSUBv2f32:
5902
case AArch64::FSUBv2f64:
5903
case AArch64::FSUBv4f32:
5904
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5905
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5906
// the target options or if FADD/FSUB has the contract fast-math flag.
5907
return Options.UnsafeFPMath ||
5908
Options.AllowFPOpFusion == FPOpFusion::Fast ||
5909
Inst.getFlag(MachineInstr::FmContract);
5910
return true;
5911
}
5912
return false;
5913
}
5914
5915
// Opcodes that can be combined with a MUL
5916
static bool isCombineInstrCandidate(unsigned Opc) {
5917
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5918
}
5919
5920
//
5921
// Utility routine that checks if \param MO is defined by an
5922
// \param CombineOpc instruction in the basic block \param MBB
5923
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5924
unsigned CombineOpc, unsigned ZeroReg = 0,
5925
bool CheckZeroReg = false) {
5926
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5927
MachineInstr *MI = nullptr;
5928
5929
if (MO.isReg() && MO.getReg().isVirtual())
5930
MI = MRI.getUniqueVRegDef(MO.getReg());
5931
// And it needs to be in the trace (otherwise, it won't have a depth).
5932
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5933
return false;
5934
  // Must only be used by the user we combine with.
5935
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5936
return false;
5937
5938
if (CheckZeroReg) {
5939
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5940
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5941
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5942
// The third input reg must be zero.
5943
if (MI->getOperand(3).getReg() != ZeroReg)
5944
return false;
5945
}
5946
5947
if (isCombineInstrSettingFlag(CombineOpc) &&
5948
MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
5949
return false;
5950
5951
return true;
5952
}
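
// For the integer case this means (illustrative, not part of the upstream
// source) that %2 = MADDWrrr %a, %b, $wzr qualifies as the multiply feeding
// the combine only if %2 has a single non-debug use and its addend operand
// is WZR, i.e. the MADD is really a plain MUL.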
5953
5954
//
5955
// Is \param MO defined by an integer multiply and can be combined?
5956
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5957
unsigned MulOpc, unsigned ZeroReg) {
5958
return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5959
}
5960
5961
//
5962
// Is \param MO defined by a floating-point multiply and can be combined?
5963
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5964
unsigned MulOpc) {
5965
return canCombine(MBB, MO, MulOpc);
5966
}
5967
5968
// TODO: There are many more machine instruction opcodes to match:
5969
// 1. Other data types (integer, vectors)
5970
// 2. Other math / logic operations (xor, or)
5971
// 3. Other forms of the same operation (intrinsics and other variants)
5972
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5973
bool Invert) const {
5974
if (Invert)
5975
return false;
5976
switch (Inst.getOpcode()) {
5977
// == Floating-point types ==
5978
// -- Floating-point instructions --
5979
case AArch64::FADDHrr:
5980
case AArch64::FADDSrr:
5981
case AArch64::FADDDrr:
5982
case AArch64::FMULHrr:
5983
case AArch64::FMULSrr:
5984
case AArch64::FMULDrr:
5985
case AArch64::FMULX16:
5986
case AArch64::FMULX32:
5987
case AArch64::FMULX64:
5988
// -- Advanced SIMD instructions --
5989
case AArch64::FADDv4f16:
5990
case AArch64::FADDv8f16:
5991
case AArch64::FADDv2f32:
5992
case AArch64::FADDv4f32:
5993
case AArch64::FADDv2f64:
5994
case AArch64::FMULv4f16:
5995
case AArch64::FMULv8f16:
5996
case AArch64::FMULv2f32:
5997
case AArch64::FMULv4f32:
5998
case AArch64::FMULv2f64:
5999
case AArch64::FMULXv4f16:
6000
case AArch64::FMULXv8f16:
6001
case AArch64::FMULXv2f32:
6002
case AArch64::FMULXv4f32:
6003
case AArch64::FMULXv2f64:
6004
// -- SVE instructions --
6005
// Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6006
// in the SVE instruction set (though there are predicated ones).
6007
case AArch64::FADD_ZZZ_H:
6008
case AArch64::FADD_ZZZ_S:
6009
case AArch64::FADD_ZZZ_D:
6010
case AArch64::FMUL_ZZZ_H:
6011
case AArch64::FMUL_ZZZ_S:
6012
case AArch64::FMUL_ZZZ_D:
6013
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6014
(Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6015
Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6016
6017
// == Integer types ==
6018
// -- Base instructions --
6019
// Opcodes MULWrr and MULXrr don't exist because
6020
// `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6021
// `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6022
  // The machine-combiner does not support three-source-operand machine
  // instructions, so we cannot reassociate MULs.
6024
case AArch64::ADDWrr:
6025
case AArch64::ADDXrr:
6026
case AArch64::ANDWrr:
6027
case AArch64::ANDXrr:
6028
case AArch64::ORRWrr:
6029
case AArch64::ORRXrr:
6030
case AArch64::EORWrr:
6031
case AArch64::EORXrr:
6032
case AArch64::EONWrr:
6033
case AArch64::EONXrr:
6034
// -- Advanced SIMD instructions --
6035
// Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6036
// in the Advanced SIMD instruction set.
6037
case AArch64::ADDv8i8:
6038
case AArch64::ADDv16i8:
6039
case AArch64::ADDv4i16:
6040
case AArch64::ADDv8i16:
6041
case AArch64::ADDv2i32:
6042
case AArch64::ADDv4i32:
6043
case AArch64::ADDv1i64:
6044
case AArch64::ADDv2i64:
6045
case AArch64::MULv8i8:
6046
case AArch64::MULv16i8:
6047
case AArch64::MULv4i16:
6048
case AArch64::MULv8i16:
6049
case AArch64::MULv2i32:
6050
case AArch64::MULv4i32:
6051
case AArch64::ANDv8i8:
6052
case AArch64::ANDv16i8:
6053
case AArch64::ORRv8i8:
6054
case AArch64::ORRv16i8:
6055
case AArch64::EORv8i8:
6056
case AArch64::EORv16i8:
6057
// -- SVE instructions --
6058
case AArch64::ADD_ZZZ_B:
6059
case AArch64::ADD_ZZZ_H:
6060
case AArch64::ADD_ZZZ_S:
6061
case AArch64::ADD_ZZZ_D:
6062
case AArch64::MUL_ZZZ_B:
6063
case AArch64::MUL_ZZZ_H:
6064
case AArch64::MUL_ZZZ_S:
6065
case AArch64::MUL_ZZZ_D:
6066
case AArch64::AND_ZZZ:
6067
case AArch64::ORR_ZZZ:
6068
case AArch64::EOR_ZZZ:
6069
return true;
6070
6071
default:
6072
return false;
6073
}
6074
}
6075
6076
/// Find instructions that can be turned into madd.
6077
static bool getMaddPatterns(MachineInstr &Root,
6078
SmallVectorImpl<unsigned> &Patterns) {
6079
unsigned Opc = Root.getOpcode();
6080
MachineBasicBlock &MBB = *Root.getParent();
6081
bool Found = false;
6082
6083
if (!isCombineInstrCandidate(Opc))
6084
return false;
6085
if (isCombineInstrSettingFlag(Opc)) {
6086
int Cmp_NZCV =
6087
Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6088
// When NZCV is live bail out.
6089
if (Cmp_NZCV == -1)
6090
return false;
6091
unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6092
// When opcode can't change bail out.
6093
// CHECKME: do we miss any cases for opcode conversion?
6094
if (NewOpc == Opc)
6095
return false;
6096
Opc = NewOpc;
6097
}
6098
6099
auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6100
unsigned Pattern) {
6101
if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6102
Patterns.push_back(Pattern);
6103
Found = true;
6104
}
6105
};
6106
6107
auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6108
if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6109
Patterns.push_back(Pattern);
6110
Found = true;
6111
}
6112
};
6113
6114
typedef AArch64MachineCombinerPattern MCP;
6115
6116
switch (Opc) {
6117
default:
6118
break;
6119
case AArch64::ADDWrr:
6120
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6121
"ADDWrr does not have register operands");
6122
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6123
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6124
break;
6125
case AArch64::ADDXrr:
6126
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6127
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6128
break;
6129
case AArch64::SUBWrr:
6130
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6131
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6132
break;
6133
case AArch64::SUBXrr:
6134
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6135
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6136
break;
6137
case AArch64::ADDWri:
6138
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6139
break;
6140
case AArch64::ADDXri:
6141
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6142
break;
6143
case AArch64::SUBWri:
6144
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6145
break;
6146
case AArch64::SUBXri:
6147
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6148
break;
6149
case AArch64::ADDv8i8:
6150
setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6151
setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6152
break;
6153
case AArch64::ADDv16i8:
6154
setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6155
setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6156
break;
6157
case AArch64::ADDv4i16:
6158
setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6159
setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6160
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6161
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6162
break;
6163
case AArch64::ADDv8i16:
6164
setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6165
setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6166
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6167
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6168
break;
6169
case AArch64::ADDv2i32:
6170
setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6171
setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6172
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6173
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6174
break;
6175
case AArch64::ADDv4i32:
6176
setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6177
setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6178
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6179
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6180
break;
6181
case AArch64::SUBv8i8:
6182
setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6183
setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6184
break;
6185
case AArch64::SUBv16i8:
6186
setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6187
setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6188
break;
6189
case AArch64::SUBv4i16:
6190
setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6191
setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6192
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6193
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6194
break;
6195
case AArch64::SUBv8i16:
6196
setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6197
setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6198
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6199
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6200
break;
6201
case AArch64::SUBv2i32:
6202
setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6203
setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6204
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6205
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6206
break;
6207
case AArch64::SUBv4i32:
6208
setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6209
setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6210
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6211
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6212
break;
6213
}
6214
return Found;
6215
}
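
// Illustrative example (not part of the upstream source): for
//   %3:gpr32 = ADDWrr %2, %1
// where %2 is defined by MADDWrrr %a, %b, $wzr (a plain MUL) with a single
// use, MULADDW_OP1 is recorded so the machine combiner can later evaluate
// rewriting the pair as a single MADDWrrr %a, %b, %1.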
6216
/// Floating-Point Support
6217
6218
/// Find instructions that can be turned into madd.
6219
static bool getFMAPatterns(MachineInstr &Root,
6220
SmallVectorImpl<unsigned> &Patterns) {
6221
6222
if (!isCombineInstrCandidateFP(Root))
6223
return false;
6224
6225
MachineBasicBlock &MBB = *Root.getParent();
6226
bool Found = false;
6227
6228
auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6229
if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6230
Patterns.push_back(Pattern);
6231
return true;
6232
}
6233
return false;
6234
};
6235
6236
typedef AArch64MachineCombinerPattern MCP;
6237
6238
switch (Root.getOpcode()) {
6239
default:
6240
assert(false && "Unsupported FP instruction in combiner\n");
6241
break;
6242
case AArch64::FADDHrr:
6243
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6244
"FADDHrr does not have register operands");
6245
6246
Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6247
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6248
break;
6249
case AArch64::FADDSrr:
6250
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6251
"FADDSrr does not have register operands");
6252
6253
Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6254
Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6255
6256
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6257
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6258
break;
6259
case AArch64::FADDDrr:
6260
Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6261
Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6262
6263
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6264
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6265
break;
6266
case AArch64::FADDv4f16:
6267
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6268
Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6269
6270
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6271
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6272
break;
6273
case AArch64::FADDv8f16:
6274
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6275
Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6276
6277
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6278
Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6279
break;
6280
case AArch64::FADDv2f32:
6281
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6282
Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6283
6284
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6285
Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6286
break;
6287
case AArch64::FADDv2f64:
6288
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6289
Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6290
6291
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6292
Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6293
break;
6294
case AArch64::FADDv4f32:
6295
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6296
Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6297
6298
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6299
Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6300
break;
6301
case AArch64::FSUBHrr:
6302
Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6303
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6304
Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6305
break;
6306
case AArch64::FSUBSrr:
6307
Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6308
6309
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6310
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6311
6312
Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6313
break;
6314
case AArch64::FSUBDrr:
6315
Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6316
6317
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6318
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6319
6320
Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6321
break;
6322
case AArch64::FSUBv4f16:
6323
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6324
Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6325
6326
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6327
Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6328
break;
6329
case AArch64::FSUBv8f16:
6330
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6331
Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6332
6333
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6334
Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6335
break;
6336
case AArch64::FSUBv2f32:
6337
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6338
Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6339
6340
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6341
Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6342
break;
6343
case AArch64::FSUBv2f64:
6344
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6345
Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6346
6347
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6348
Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6349
break;
6350
case AArch64::FSUBv4f32:
6351
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6352
Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6353
6354
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6355
Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6356
break;
6357
}
6358
return Found;
6359
}
6360
6361
static bool getFMULPatterns(MachineInstr &Root,
6362
SmallVectorImpl<unsigned> &Patterns) {
6363
MachineBasicBlock &MBB = *Root.getParent();
6364
bool Found = false;
6365
6366
auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6367
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6368
MachineOperand &MO = Root.getOperand(Operand);
6369
MachineInstr *MI = nullptr;
6370
if (MO.isReg() && MO.getReg().isVirtual())
6371
MI = MRI.getUniqueVRegDef(MO.getReg());
6372
// Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6373
if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6374
MI->getOperand(1).getReg().isVirtual())
6375
MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6376
if (MI && MI->getOpcode() == Opcode) {
6377
Patterns.push_back(Pattern);
6378
return true;
6379
}
6380
return false;
6381
};
6382
6383
typedef AArch64MachineCombinerPattern MCP;
6384
6385
switch (Root.getOpcode()) {
6386
default:
6387
return false;
6388
case AArch64::FMULv2f32:
6389
Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6390
Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6391
break;
6392
case AArch64::FMULv2f64:
6393
Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6394
Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6395
break;
6396
case AArch64::FMULv4f16:
6397
Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6398
Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6399
break;
6400
case AArch64::FMULv4f32:
6401
Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6402
Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6403
break;
6404
case AArch64::FMULv8f16:
6405
Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6406
Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6407
break;
6408
}
6409
6410
return Found;
6411
}
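
// Illustrative example (not part of the upstream source): an FMULv2f32 whose
// operand is fed, possibly through a no-op COPY, by a DUPv2i32lane records
// FMULv2i32_indexed_OP1/OP2, so the multiply can later be considered for the
// by-element FMULv2i32_indexed form.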
6412
6413
static bool getFNEGPatterns(MachineInstr &Root,
6414
SmallVectorImpl<unsigned> &Patterns) {
6415
unsigned Opc = Root.getOpcode();
6416
MachineBasicBlock &MBB = *Root.getParent();
6417
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6418
6419
auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6420
MachineOperand &MO = Root.getOperand(1);
6421
MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6422
if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6423
MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6424
Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6425
Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6426
MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6427
MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6428
Patterns.push_back(Pattern);
6429
return true;
6430
}
6431
return false;
6432
};
6433
6434
switch (Opc) {
6435
default:
6436
break;
6437
case AArch64::FNEGDr:
6438
return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6439
case AArch64::FNEGSr:
6440
return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6441
}
6442
6443
return false;
6444
}
6445
6446
/// Return true when a code sequence can improve throughput. It
6447
/// should be called only for instructions in loops.
6448
/// \param Pattern - combiner pattern
6449
bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6450
switch (Pattern) {
6451
default:
6452
break;
6453
case AArch64MachineCombinerPattern::FMULADDH_OP1:
6454
case AArch64MachineCombinerPattern::FMULADDH_OP2:
6455
case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6456
case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6457
case AArch64MachineCombinerPattern::FMULADDS_OP1:
6458
case AArch64MachineCombinerPattern::FMULADDS_OP2:
6459
case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6460
case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6461
case AArch64MachineCombinerPattern::FMULADDD_OP1:
6462
case AArch64MachineCombinerPattern::FMULADDD_OP2:
6463
case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6464
case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6465
case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6466
case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6467
case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6468
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6469
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6470
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6471
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6472
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6473
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6474
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6475
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6476
case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6477
case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6478
case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6479
case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6480
case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6481
case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6482
case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6483
case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6484
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6485
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6486
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
}

/// Find other MI combine patterns.
static bool getMiscPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  // A - (B + C) ==> (A - B) - C or (A - C) - B
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();

  switch (Opc) {
  case AArch64::SUBWrr:
  case AArch64::SUBSWrr:
  case AArch64::SUBXrr:
  case AArch64::SUBSXrr:
    // Found candidate root.
    break;
  default:
    return false;
  }

  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
          -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}
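
// Illustrative note (not part of the upstream source): the shape matched by
// getMiscPatterns above, sketched as pseudo machine IR on hypothetical
// virtual registers:
//   %t = ADDWrr %b, %c
//   %r = SUBWrr %a, %t        ; Root matches A - (B + C)
// The SUBADD_OP1/SUBADD_OP2 patterns recorded here are later materialized by
// genSubAdd2SubSub() below as (A - B) - C and (A - C) - B; the machine
// combiner only keeps such a rewrite when it reduces dependence depth (see
// getCombinerObjective below).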

CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
  switch (Pattern) {
  case AArch64MachineCombinerPattern::SUBADD_OP1:
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    return CombinerObjective::MustReduceDepth;
  default:
    return TargetInstrInfo::getCombinerObjective(Pattern);
  }
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
/// pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
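// Editorial note (inferred from genFusedMultiply below, not upstream text):
//   Default     - scalar-style F|MADD form; the addend register is appended
//                 last.
//   Indexed     - lane-indexed FMLA/FMLS form; the accumulator comes first
//                 and the lane immediate of the original F|MUL is appended.
//   Accumulator - vector FMLA/FMLS/MLA/MLS form; the accumulator comes first
//                 and no lane immediate is used.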
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind the kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

static MachineInstr *
genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
               const TargetInstrInfo *TII, MachineInstr &Root,
               SmallVectorImpl<MachineInstr *> &InsInstrs) {
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(SrcReg2, getKillRegState(Src2IsKill));
  InsInstrs.push_back(MIB);

  return MAD;
}

/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static MachineInstr *
genIndexedMultiply(MachineInstr &Root,
                   SmallVectorImpl<MachineInstr *> &InsInstrs,
                   unsigned IdxDupOp, unsigned MulOpc,
                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
         "Invalid index of FMUL operand");

  MachineFunction &MF = *Root.getMF();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *Dup =
      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());

  if (Dup->getOpcode() == TargetOpcode::COPY)
    Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());

  Register DupSrcReg = Dup->getOperand(1).getReg();
  MRI.clearKillFlags(DupSrcReg);
  MRI.constrainRegClass(DupSrcReg, RC);

  unsigned DupSrcLane = Dup->getOperand(2).getImm();

  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
  MachineOperand &MulOp = Root.getOperand(IdxMulOp);

  Register ResultReg = Root.getOperand(0).getReg();

  MachineInstrBuilder MIB;
  MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
            .add(MulOp)
            .addReg(DupSrcReg)
            .addImm(DupSrcLane);

  InsInstrs.push_back(MIB);
  return &Root;
}

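// Illustrative note (not part of the upstream source): the rewrite performed
// by genIndexedMultiply above, sketched on hypothetical virtual registers for
// a v2f32 multiply (the DUP opcode name here is for illustration only):
//   %dup = DUPv2i32lane %vec, 1
//   %res = FMULv2f32 %x, %dup
//   ==>
//   %res = FMULv2i32_indexed %x, %vec, 1
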
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
/// accumulate instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
                              const TargetInstrInfo *TII, MachineInstr &Root,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
                              const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(VR))
    MRI.constrainRegClass(VR, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(VR);
  // Insert the MADD
  InsInstrs.push_back(MIB);
  return MUL;
}

/// Do the following transformation
/// A - (B + C) ==> (A - B) - C
/// A - (B + C) ==> (A - C) - B
static void
genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs,
                 SmallVectorImpl<MachineInstr *> &DelInstrs,
                 unsigned IdxOpd1,
                 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

  Register ResultReg = Root.getOperand(0).getReg();
  Register RegA = Root.getOperand(1).getReg();
  bool RegAIsKill = Root.getOperand(1).isKill();
  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
  Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));

  unsigned Opcode = Root.getOpcode();
  if (Opcode == AArch64::SUBSWrr)
    Opcode = AArch64::SUBWrr;
  else if (Opcode == AArch64::SUBSXrr)
    Opcode = AArch64::SUBXrr;
  else
    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
           "Unexpected instruction opcode.");

  uint32_t Flags = Root.mergeFlagsWith(*AddMI);
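  // Editorial note (explanation, not upstream text): the no-signed-wrap and
  // no-unsigned-wrap flags are cleared below because reassociating
  // A - (B + C) into (A - B) - C can overflow in the intermediate
  // subtraction even when the original sequence did not.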
  Flags &= ~MachineInstr::NoSWrap;
  Flags &= ~MachineInstr::NoUWrap;

  MachineInstrBuilder MIB1 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
          .addReg(RegA, getKillRegState(RegAIsKill))
          .addReg(RegB, getKillRegState(RegBIsKill))
          .setMIFlags(Flags);
  MachineInstrBuilder MIB2 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
          .addReg(NewVR, getKillRegState(true))
          .addReg(RegC, getKillRegState(RegCIsKill))
          .setMIFlags(Flags);

  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
  InsInstrs.push_back(MIB1);
  InsInstrs.push_back(MIB2);
  DelInstrs.push_back(AddMI);
  DelInstrs.push_back(&Root);
}

/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, unsigned Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *MUL = nullptr;
  const TargetRegisterClass *RC;
  unsigned Opc;
  switch (Pattern) {
  default:
    // Reassociate instructions.
    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                DelInstrs, InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP1:
    // A - (B + C)
    // ==> (A - B) - C
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
                     InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    // A - (B + C)
    // ==> (A - C) - B
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
                     InstrIdxForVirtReg);
    return;
case AArch64MachineCombinerPattern::MULADDW_OP1:
7012
case AArch64MachineCombinerPattern::MULADDX_OP1:
7013
// MUL I=A,B,0
7014
// ADD R,I,C
7015
// ==> MADD R,A,B,C
7016
// --- Create(MADD);
7017
if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7018
Opc = AArch64::MADDWrrr;
7019
RC = &AArch64::GPR32RegClass;
7020
} else {
7021
Opc = AArch64::MADDXrrr;
7022
RC = &AArch64::GPR64RegClass;
7023
}
7024
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7025
break;
7026
case AArch64MachineCombinerPattern::MULADDW_OP2:
7027
case AArch64MachineCombinerPattern::MULADDX_OP2:
7028
// MUL I=A,B,0
7029
// ADD R,C,I
7030
// ==> MADD R,A,B,C
7031
// --- Create(MADD);
7032
if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7033
Opc = AArch64::MADDWrrr;
7034
RC = &AArch64::GPR32RegClass;
7035
} else {
7036
Opc = AArch64::MADDXrrr;
7037
RC = &AArch64::GPR64RegClass;
7038
}
7039
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7040
break;
7041
case AArch64MachineCombinerPattern::MULADDWI_OP1:
7042
case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7043
// MUL I=A,B,0
7044
// ADD R,I,Imm
7045
// ==> MOV V, Imm
7046
// ==> MADD R,A,B,V
7047
// --- Create(MADD);
7048
const TargetRegisterClass *OrrRC;
7049
unsigned BitSize, OrrOpc, ZeroReg;
7050
if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7051
OrrOpc = AArch64::ORRWri;
7052
OrrRC = &AArch64::GPR32spRegClass;
7053
BitSize = 32;
7054
ZeroReg = AArch64::WZR;
7055
Opc = AArch64::MADDWrrr;
7056
RC = &AArch64::GPR32RegClass;
7057
} else {
7058
OrrOpc = AArch64::ORRXri;
7059
OrrRC = &AArch64::GPR64spRegClass;
7060
BitSize = 64;
7061
ZeroReg = AArch64::XZR;
7062
Opc = AArch64::MADDXrrr;
7063
RC = &AArch64::GPR64RegClass;
7064
}
7065
Register NewVR = MRI.createVirtualRegister(OrrRC);
7066
uint64_t Imm = Root.getOperand(2).getImm();
7067
7068
if (Root.getOperand(3).isImm()) {
7069
unsigned Val = Root.getOperand(3).getImm();
7070
Imm = Imm << Val;
7071
}
7072
uint64_t UImm = SignExtend64(Imm, BitSize);
7073
// The immediate can be composed via a single instruction.
7074
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7075
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7076
if (Insn.size() != 1)
7077
return;
7078
auto MovI = Insn.begin();
7079
MachineInstrBuilder MIB1;
7080
// MOV is an alias for one of three instructions: movz, movn, and orr.
7081
if (MovI->Opcode == OrrOpc)
7082
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7083
.addReg(ZeroReg)
7084
.addImm(MovI->Op2);
7085
else {
7086
if (BitSize == 32)
7087
assert((MovI->Opcode == AArch64::MOVNWi ||
7088
MovI->Opcode == AArch64::MOVZWi) &&
7089
"Expected opcode");
7090
else
7091
assert((MovI->Opcode == AArch64::MOVNXi ||
7092
MovI->Opcode == AArch64::MOVZXi) &&
7093
"Expected opcode");
7094
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7095
.addImm(MovI->Op1)
7096
.addImm(MovI->Op2);
7097
}
7098
InsInstrs.push_back(MIB1);
7099
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7100
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7101
break;
7102
}
7103
case AArch64MachineCombinerPattern::MULSUBW_OP1:
7104
case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7105
// MUL I=A,B,0
7106
// SUB R,I, C
7107
// ==> SUB V, 0, C
7108
// ==> MADD R,A,B,V // = -C + A*B
7109
// --- Create(MADD);
7110
const TargetRegisterClass *SubRC;
7111
unsigned SubOpc, ZeroReg;
7112
if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7113
SubOpc = AArch64::SUBWrr;
7114
SubRC = &AArch64::GPR32spRegClass;
7115
ZeroReg = AArch64::WZR;
7116
Opc = AArch64::MADDWrrr;
7117
RC = &AArch64::GPR32RegClass;
7118
} else {
7119
SubOpc = AArch64::SUBXrr;
7120
SubRC = &AArch64::GPR64spRegClass;
7121
ZeroReg = AArch64::XZR;
7122
Opc = AArch64::MADDXrrr;
7123
RC = &AArch64::GPR64RegClass;
7124
}
7125
Register NewVR = MRI.createVirtualRegister(SubRC);
7126
// SUB NewVR, 0, C
7127
MachineInstrBuilder MIB1 =
7128
BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7129
.addReg(ZeroReg)
7130
.add(Root.getOperand(2));
7131
InsInstrs.push_back(MIB1);
7132
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7133
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7134
break;
7135
}
7136
case AArch64MachineCombinerPattern::MULSUBW_OP2:
7137
case AArch64MachineCombinerPattern::MULSUBX_OP2:
7138
// MUL I=A,B,0
7139
// SUB R,C,I
7140
// ==> MSUB R,A,B,C (computes C - A*B)
7141
// --- Create(MSUB);
7142
if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7143
Opc = AArch64::MSUBWrrr;
7144
RC = &AArch64::GPR32RegClass;
7145
} else {
7146
Opc = AArch64::MSUBXrrr;
7147
RC = &AArch64::GPR64RegClass;
7148
}
7149
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7150
break;
7151
case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7152
case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7153
// MUL I=A,B,0
7154
// SUB R,I, Imm
7155
// ==> MOV V, -Imm
7156
// ==> MADD R,A,B,V // = -Imm + A*B
7157
// --- Create(MADD);
7158
const TargetRegisterClass *OrrRC;
7159
unsigned BitSize, OrrOpc, ZeroReg;
7160
if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7161
OrrOpc = AArch64::ORRWri;
7162
OrrRC = &AArch64::GPR32spRegClass;
7163
BitSize = 32;
7164
ZeroReg = AArch64::WZR;
7165
Opc = AArch64::MADDWrrr;
7166
RC = &AArch64::GPR32RegClass;
7167
} else {
7168
OrrOpc = AArch64::ORRXri;
7169
OrrRC = &AArch64::GPR64spRegClass;
7170
BitSize = 64;
7171
ZeroReg = AArch64::XZR;
7172
Opc = AArch64::MADDXrrr;
7173
RC = &AArch64::GPR64RegClass;
7174
}
7175
Register NewVR = MRI.createVirtualRegister(OrrRC);
7176
uint64_t Imm = Root.getOperand(2).getImm();
7177
if (Root.getOperand(3).isImm()) {
7178
unsigned Val = Root.getOperand(3).getImm();
7179
Imm = Imm << Val;
7180
}
7181
uint64_t UImm = SignExtend64(-Imm, BitSize);
7182
// The immediate can be composed via a single instruction.
7183
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7184
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7185
if (Insn.size() != 1)
7186
return;
7187
auto MovI = Insn.begin();
7188
MachineInstrBuilder MIB1;
7189
// MOV is an alias for one of three instructions: movz, movn, and orr.
7190
if (MovI->Opcode == OrrOpc)
7191
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7192
.addReg(ZeroReg)
7193
.addImm(MovI->Op2);
7194
else {
7195
if (BitSize == 32)
7196
assert((MovI->Opcode == AArch64::MOVNWi ||
7197
MovI->Opcode == AArch64::MOVZWi) &&
7198
"Expected opcode");
7199
else
7200
assert((MovI->Opcode == AArch64::MOVNXi ||
7201
MovI->Opcode == AArch64::MOVZXi) &&
7202
"Expected opcode");
7203
MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7204
.addImm(MovI->Op1)
7205
.addImm(MovI->Op2);
7206
}
7207
InsInstrs.push_back(MIB1);
7208
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7209
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7210
break;
7211
}
7212
7213
case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7214
Opc = AArch64::MLAv8i8;
7215
RC = &AArch64::FPR64RegClass;
7216
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7217
break;
7218
case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7219
Opc = AArch64::MLAv8i8;
7220
RC = &AArch64::FPR64RegClass;
7221
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7222
break;
7223
case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7224
Opc = AArch64::MLAv16i8;
7225
RC = &AArch64::FPR128RegClass;
7226
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7227
break;
7228
case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7229
Opc = AArch64::MLAv16i8;
7230
RC = &AArch64::FPR128RegClass;
7231
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7232
break;
7233
case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7234
Opc = AArch64::MLAv4i16;
7235
RC = &AArch64::FPR64RegClass;
7236
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7237
break;
7238
case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7239
Opc = AArch64::MLAv4i16;
7240
RC = &AArch64::FPR64RegClass;
7241
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7242
break;
7243
case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7244
Opc = AArch64::MLAv8i16;
7245
RC = &AArch64::FPR128RegClass;
7246
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7247
break;
7248
case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7249
Opc = AArch64::MLAv8i16;
7250
RC = &AArch64::FPR128RegClass;
7251
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7252
break;
7253
case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7254
Opc = AArch64::MLAv2i32;
7255
RC = &AArch64::FPR64RegClass;
7256
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7257
break;
7258
case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7259
Opc = AArch64::MLAv2i32;
7260
RC = &AArch64::FPR64RegClass;
7261
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7262
break;
7263
case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7264
Opc = AArch64::MLAv4i32;
7265
RC = &AArch64::FPR128RegClass;
7266
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7267
break;
7268
case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7269
Opc = AArch64::MLAv4i32;
7270
RC = &AArch64::FPR128RegClass;
7271
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7272
break;
7273
7274
case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7275
Opc = AArch64::MLAv8i8;
7276
RC = &AArch64::FPR64RegClass;
7277
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7278
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7279
RC);
7280
break;
7281
case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7282
Opc = AArch64::MLSv8i8;
7283
RC = &AArch64::FPR64RegClass;
7284
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7285
break;
7286
case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7287
Opc = AArch64::MLAv16i8;
7288
RC = &AArch64::FPR128RegClass;
7289
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7290
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7291
RC);
7292
break;
7293
case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7294
Opc = AArch64::MLSv16i8;
7295
RC = &AArch64::FPR128RegClass;
7296
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7297
break;
7298
case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7299
Opc = AArch64::MLAv4i16;
7300
RC = &AArch64::FPR64RegClass;
7301
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7302
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7303
RC);
7304
break;
7305
case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7306
Opc = AArch64::MLSv4i16;
7307
RC = &AArch64::FPR64RegClass;
7308
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7309
break;
7310
case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7311
Opc = AArch64::MLAv8i16;
7312
RC = &AArch64::FPR128RegClass;
7313
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7314
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7315
RC);
7316
break;
7317
case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7318
Opc = AArch64::MLSv8i16;
7319
RC = &AArch64::FPR128RegClass;
7320
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7321
break;
7322
case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7323
Opc = AArch64::MLAv2i32;
7324
RC = &AArch64::FPR64RegClass;
7325
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7326
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7327
RC);
7328
break;
7329
case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7330
Opc = AArch64::MLSv2i32;
7331
RC = &AArch64::FPR64RegClass;
7332
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7333
break;
7334
case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7335
Opc = AArch64::MLAv4i32;
7336
RC = &AArch64::FPR128RegClass;
7337
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7338
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7339
RC);
7340
break;
7341
case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7342
Opc = AArch64::MLSv4i32;
7343
RC = &AArch64::FPR128RegClass;
7344
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7345
break;
7346
7347
case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7348
Opc = AArch64::MLAv4i16_indexed;
7349
RC = &AArch64::FPR64RegClass;
7350
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7351
break;
7352
case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7353
Opc = AArch64::MLAv4i16_indexed;
7354
RC = &AArch64::FPR64RegClass;
7355
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7356
break;
7357
case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7358
Opc = AArch64::MLAv8i16_indexed;
7359
RC = &AArch64::FPR128RegClass;
7360
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7361
break;
7362
case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7363
Opc = AArch64::MLAv8i16_indexed;
7364
RC = &AArch64::FPR128RegClass;
7365
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7366
break;
7367
case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7368
Opc = AArch64::MLAv2i32_indexed;
7369
RC = &AArch64::FPR64RegClass;
7370
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7371
break;
7372
case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7373
Opc = AArch64::MLAv2i32_indexed;
7374
RC = &AArch64::FPR64RegClass;
7375
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7376
break;
7377
case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7378
Opc = AArch64::MLAv4i32_indexed;
7379
RC = &AArch64::FPR128RegClass;
7380
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7381
break;
7382
case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7383
Opc = AArch64::MLAv4i32_indexed;
7384
RC = &AArch64::FPR128RegClass;
7385
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7386
break;
7387
7388
case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7389
Opc = AArch64::MLAv4i16_indexed;
7390
RC = &AArch64::FPR64RegClass;
7391
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7392
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7393
RC);
7394
break;
7395
case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7396
Opc = AArch64::MLSv4i16_indexed;
7397
RC = &AArch64::FPR64RegClass;
7398
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7399
break;
7400
case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7401
Opc = AArch64::MLAv8i16_indexed;
7402
RC = &AArch64::FPR128RegClass;
7403
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7404
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7405
RC);
7406
break;
7407
case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7408
Opc = AArch64::MLSv8i16_indexed;
7409
RC = &AArch64::FPR128RegClass;
7410
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7411
break;
7412
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7413
Opc = AArch64::MLAv2i32_indexed;
7414
RC = &AArch64::FPR64RegClass;
7415
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7416
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7417
RC);
7418
break;
7419
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7420
Opc = AArch64::MLSv2i32_indexed;
7421
RC = &AArch64::FPR64RegClass;
7422
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7423
break;
7424
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7425
Opc = AArch64::MLAv4i32_indexed;
7426
RC = &AArch64::FPR128RegClass;
7427
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7428
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7429
RC);
7430
break;
7431
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7432
Opc = AArch64::MLSv4i32_indexed;
7433
RC = &AArch64::FPR128RegClass;
7434
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7435
break;
7436
7437
// Floating Point Support
7438
case AArch64MachineCombinerPattern::FMULADDH_OP1:
7439
Opc = AArch64::FMADDHrrr;
7440
RC = &AArch64::FPR16RegClass;
7441
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7442
break;
7443
case AArch64MachineCombinerPattern::FMULADDS_OP1:
7444
Opc = AArch64::FMADDSrrr;
7445
RC = &AArch64::FPR32RegClass;
7446
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7447
break;
7448
case AArch64MachineCombinerPattern::FMULADDD_OP1:
7449
Opc = AArch64::FMADDDrrr;
7450
RC = &AArch64::FPR64RegClass;
7451
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7452
break;
7453
7454
case AArch64MachineCombinerPattern::FMULADDH_OP2:
7455
Opc = AArch64::FMADDHrrr;
7456
RC = &AArch64::FPR16RegClass;
7457
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7458
break;
7459
case AArch64MachineCombinerPattern::FMULADDS_OP2:
7460
Opc = AArch64::FMADDSrrr;
7461
RC = &AArch64::FPR32RegClass;
7462
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7463
break;
7464
case AArch64MachineCombinerPattern::FMULADDD_OP2:
7465
Opc = AArch64::FMADDDrrr;
7466
RC = &AArch64::FPR64RegClass;
7467
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7468
break;
7469
7470
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7471
Opc = AArch64::FMLAv1i32_indexed;
7472
RC = &AArch64::FPR32RegClass;
7473
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7474
FMAInstKind::Indexed);
7475
break;
7476
case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7477
Opc = AArch64::FMLAv1i32_indexed;
7478
RC = &AArch64::FPR32RegClass;
7479
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7480
FMAInstKind::Indexed);
7481
break;
7482
7483
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7484
Opc = AArch64::FMLAv1i64_indexed;
7485
RC = &AArch64::FPR64RegClass;
7486
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7487
FMAInstKind::Indexed);
7488
break;
7489
case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7490
Opc = AArch64::FMLAv1i64_indexed;
7491
RC = &AArch64::FPR64RegClass;
7492
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7493
FMAInstKind::Indexed);
7494
break;
7495
7496
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7497
RC = &AArch64::FPR64RegClass;
7498
Opc = AArch64::FMLAv4i16_indexed;
7499
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7500
FMAInstKind::Indexed);
7501
break;
7502
case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7503
RC = &AArch64::FPR64RegClass;
7504
Opc = AArch64::FMLAv4f16;
7505
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7506
FMAInstKind::Accumulator);
7507
break;
7508
case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7509
RC = &AArch64::FPR64RegClass;
7510
Opc = AArch64::FMLAv4i16_indexed;
7511
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7512
FMAInstKind::Indexed);
7513
break;
7514
case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7515
RC = &AArch64::FPR64RegClass;
7516
Opc = AArch64::FMLAv4f16;
7517
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7518
FMAInstKind::Accumulator);
7519
break;
7520
7521
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7522
case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7523
RC = &AArch64::FPR64RegClass;
7524
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7525
Opc = AArch64::FMLAv2i32_indexed;
7526
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7527
FMAInstKind::Indexed);
7528
} else {
7529
Opc = AArch64::FMLAv2f32;
7530
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7531
FMAInstKind::Accumulator);
7532
}
7533
break;
7534
case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7535
case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7536
RC = &AArch64::FPR64RegClass;
7537
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7538
Opc = AArch64::FMLAv2i32_indexed;
7539
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7540
FMAInstKind::Indexed);
7541
} else {
7542
Opc = AArch64::FMLAv2f32;
7543
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7544
FMAInstKind::Accumulator);
7545
}
7546
break;
7547
7548
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7549
RC = &AArch64::FPR128RegClass;
7550
Opc = AArch64::FMLAv8i16_indexed;
7551
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7552
FMAInstKind::Indexed);
7553
break;
7554
case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7555
RC = &AArch64::FPR128RegClass;
7556
Opc = AArch64::FMLAv8f16;
7557
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7558
FMAInstKind::Accumulator);
7559
break;
7560
case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7561
RC = &AArch64::FPR128RegClass;
7562
Opc = AArch64::FMLAv8i16_indexed;
7563
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7564
FMAInstKind::Indexed);
7565
break;
7566
case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7567
RC = &AArch64::FPR128RegClass;
7568
Opc = AArch64::FMLAv8f16;
7569
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7570
FMAInstKind::Accumulator);
7571
break;
7572
7573
case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7574
case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7575
RC = &AArch64::FPR128RegClass;
7576
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7577
Opc = AArch64::FMLAv2i64_indexed;
7578
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7579
FMAInstKind::Indexed);
7580
} else {
7581
Opc = AArch64::FMLAv2f64;
7582
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7583
FMAInstKind::Accumulator);
7584
}
7585
break;
7586
case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7587
case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7588
RC = &AArch64::FPR128RegClass;
7589
if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7590
Opc = AArch64::FMLAv2i64_indexed;
7591
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7592
FMAInstKind::Indexed);
7593
} else {
7594
Opc = AArch64::FMLAv2f64;
7595
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7596
FMAInstKind::Accumulator);
7597
}
7598
break;
7599
7600
case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7601
case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7602
RC = &AArch64::FPR128RegClass;
7603
if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7604
Opc = AArch64::FMLAv4i32_indexed;
7605
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7606
FMAInstKind::Indexed);
7607
} else {
7608
Opc = AArch64::FMLAv4f32;
7609
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7610
FMAInstKind::Accumulator);
7611
}
7612
break;
7613
7614
case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7615
case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7616
RC = &AArch64::FPR128RegClass;
7617
if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7618
Opc = AArch64::FMLAv4i32_indexed;
7619
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7620
FMAInstKind::Indexed);
7621
} else {
7622
Opc = AArch64::FMLAv4f32;
7623
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7624
FMAInstKind::Accumulator);
7625
}
7626
break;
7627
7628
case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7629
Opc = AArch64::FNMSUBHrrr;
7630
RC = &AArch64::FPR16RegClass;
7631
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7632
break;
7633
case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7634
Opc = AArch64::FNMSUBSrrr;
7635
RC = &AArch64::FPR32RegClass;
7636
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7637
break;
7638
case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7639
Opc = AArch64::FNMSUBDrrr;
7640
RC = &AArch64::FPR64RegClass;
7641
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7642
break;
7643
7644
case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7645
Opc = AArch64::FNMADDHrrr;
7646
RC = &AArch64::FPR16RegClass;
7647
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7648
break;
7649
case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7650
Opc = AArch64::FNMADDSrrr;
7651
RC = &AArch64::FPR32RegClass;
7652
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7653
break;
7654
case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7655
Opc = AArch64::FNMADDDrrr;
7656
RC = &AArch64::FPR64RegClass;
7657
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7658
break;
7659
7660
case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7661
Opc = AArch64::FMSUBHrrr;
7662
RC = &AArch64::FPR16RegClass;
7663
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7664
break;
7665
case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7666
Opc = AArch64::FMSUBSrrr;
7667
RC = &AArch64::FPR32RegClass;
7668
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7669
break;
7670
case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7671
Opc = AArch64::FMSUBDrrr;
7672
RC = &AArch64::FPR64RegClass;
7673
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7674
break;
7675
7676
case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7677
Opc = AArch64::FMLSv1i32_indexed;
7678
RC = &AArch64::FPR32RegClass;
7679
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7680
FMAInstKind::Indexed);
7681
break;
7682
7683
case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7684
Opc = AArch64::FMLSv1i64_indexed;
7685
RC = &AArch64::FPR64RegClass;
7686
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7687
FMAInstKind::Indexed);
7688
break;
7689
7690
case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7691
case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7692
RC = &AArch64::FPR64RegClass;
7693
Register NewVR = MRI.createVirtualRegister(RC);
7694
MachineInstrBuilder MIB1 =
7695
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7696
.add(Root.getOperand(2));
7697
InsInstrs.push_back(MIB1);
7698
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7699
if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7700
Opc = AArch64::FMLAv4f16;
7701
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7702
FMAInstKind::Accumulator, &NewVR);
7703
} else {
7704
Opc = AArch64::FMLAv4i16_indexed;
7705
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7706
FMAInstKind::Indexed, &NewVR);
7707
}
7708
break;
7709
}
7710
case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7711
RC = &AArch64::FPR64RegClass;
7712
Opc = AArch64::FMLSv4f16;
7713
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7714
FMAInstKind::Accumulator);
7715
break;
7716
case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7717
RC = &AArch64::FPR64RegClass;
7718
Opc = AArch64::FMLSv4i16_indexed;
7719
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7720
FMAInstKind::Indexed);
7721
break;
7722
7723
case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7724
case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7725
RC = &AArch64::FPR64RegClass;
7726
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7727
Opc = AArch64::FMLSv2i32_indexed;
7728
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7729
FMAInstKind::Indexed);
7730
} else {
7731
Opc = AArch64::FMLSv2f32;
7732
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7733
FMAInstKind::Accumulator);
7734
}
7735
break;
7736
7737
case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7738
case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7739
RC = &AArch64::FPR128RegClass;
7740
Register NewVR = MRI.createVirtualRegister(RC);
7741
MachineInstrBuilder MIB1 =
7742
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7743
.add(Root.getOperand(2));
7744
InsInstrs.push_back(MIB1);
7745
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7746
if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7747
Opc = AArch64::FMLAv8f16;
7748
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7749
FMAInstKind::Accumulator, &NewVR);
7750
} else {
7751
Opc = AArch64::FMLAv8i16_indexed;
7752
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7753
FMAInstKind::Indexed, &NewVR);
7754
}
7755
break;
7756
}
7757
case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7758
RC = &AArch64::FPR128RegClass;
7759
Opc = AArch64::FMLSv8f16;
7760
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7761
FMAInstKind::Accumulator);
7762
break;
7763
case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7764
RC = &AArch64::FPR128RegClass;
7765
Opc = AArch64::FMLSv8i16_indexed;
7766
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7767
FMAInstKind::Indexed);
7768
break;
7769
7770
case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7771
case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7772
RC = &AArch64::FPR128RegClass;
7773
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7774
Opc = AArch64::FMLSv2i64_indexed;
7775
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7776
FMAInstKind::Indexed);
7777
} else {
7778
Opc = AArch64::FMLSv2f64;
7779
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7780
FMAInstKind::Accumulator);
7781
}
7782
break;
7783
7784
case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7785
case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7786
RC = &AArch64::FPR128RegClass;
7787
if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7788
Opc = AArch64::FMLSv4i32_indexed;
7789
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7790
FMAInstKind::Indexed);
7791
} else {
7792
Opc = AArch64::FMLSv4f32;
7793
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7794
FMAInstKind::Accumulator);
7795
}
7796
break;
7797
case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7798
case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7799
RC = &AArch64::FPR64RegClass;
7800
Register NewVR = MRI.createVirtualRegister(RC);
7801
MachineInstrBuilder MIB1 =
7802
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7803
.add(Root.getOperand(2));
7804
InsInstrs.push_back(MIB1);
7805
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7806
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7807
Opc = AArch64::FMLAv2i32_indexed;
7808
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7809
FMAInstKind::Indexed, &NewVR);
7810
} else {
7811
Opc = AArch64::FMLAv2f32;
7812
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7813
FMAInstKind::Accumulator, &NewVR);
7814
}
7815
break;
7816
}
7817
case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7818
case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7819
RC = &AArch64::FPR128RegClass;
7820
Register NewVR = MRI.createVirtualRegister(RC);
7821
MachineInstrBuilder MIB1 =
7822
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7823
.add(Root.getOperand(2));
7824
InsInstrs.push_back(MIB1);
7825
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7826
if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7827
Opc = AArch64::FMLAv4i32_indexed;
7828
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7829
FMAInstKind::Indexed, &NewVR);
7830
} else {
7831
Opc = AArch64::FMLAv4f32;
7832
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7833
FMAInstKind::Accumulator, &NewVR);
7834
}
7835
break;
7836
}
7837
case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7838
case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7839
RC = &AArch64::FPR128RegClass;
7840
Register NewVR = MRI.createVirtualRegister(RC);
7841
MachineInstrBuilder MIB1 =
7842
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7843
.add(Root.getOperand(2));
7844
InsInstrs.push_back(MIB1);
7845
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7846
if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7847
Opc = AArch64::FMLAv2i64_indexed;
7848
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7849
FMAInstKind::Indexed, &NewVR);
7850
} else {
7851
Opc = AArch64::FMLAv2f64;
7852
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7853
FMAInstKind::Accumulator, &NewVR);
7854
}
7855
break;
7856
}
7857
case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7858
case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7859
unsigned IdxDupOp =
7860
(Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7861
: 2;
7862
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7863
&AArch64::FPR128RegClass, MRI);
7864
break;
7865
}
7866
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7867
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7868
unsigned IdxDupOp =
7869
(Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7870
: 2;
7871
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7872
&AArch64::FPR128RegClass, MRI);
7873
break;
7874
}
7875
case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7876
case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7877
unsigned IdxDupOp =
7878
(Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7879
: 2;
7880
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7881
&AArch64::FPR128_loRegClass, MRI);
7882
break;
7883
}
7884
case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7885
case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7886
unsigned IdxDupOp =
7887
(Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7888
: 2;
7889
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7890
&AArch64::FPR128RegClass, MRI);
7891
break;
7892
}
7893
case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7894
case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7895
unsigned IdxDupOp =
7896
(Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7897
: 2;
7898
genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7899
&AArch64::FPR128_loRegClass, MRI);
7900
break;
7901
}
7902
case AArch64MachineCombinerPattern::FNMADD: {
7903
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7904
break;
7905
}
7906
7907
  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  if (MUL)
    DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);

  // Set the flags on the inserted instructions to be the merged flags of the
  // instructions that we have combined.
  uint32_t Flags = Root.getFlags();
  if (MUL)
    Flags = Root.mergeFlagsWith(*MUL);
  for (auto *MI : InsInstrs)
    MI->setFlags(Flags);
}

/// Replace csinc-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz w9, #0, 0x44
/// \endcode
/// to
/// \code
///   b.<inverted condition code>
/// \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz w9, #0, 0x44
/// \endcode
/// to
/// \code
///   b.<condition code>
/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
/// \code
///   and w8, w8, #0x400
///   cbnz w8, L1
/// \endcode
/// to
/// \code
///   tbnz w8, #10, L1
/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7960
bool IsNegativeBranch = false;
7961
bool IsTestAndBranch = false;
7962
unsigned TargetBBInMI = 0;
7963
switch (MI.getOpcode()) {
7964
default:
7965
llvm_unreachable("Unknown branch instruction?");
7966
case AArch64::Bcc:
7967
return false;
7968
case AArch64::CBZW:
7969
case AArch64::CBZX:
7970
TargetBBInMI = 1;
7971
break;
7972
case AArch64::CBNZW:
7973
case AArch64::CBNZX:
7974
TargetBBInMI = 1;
7975
IsNegativeBranch = true;
7976
break;
7977
case AArch64::TBZW:
7978
case AArch64::TBZX:
7979
TargetBBInMI = 2;
7980
IsTestAndBranch = true;
7981
break;
7982
case AArch64::TBNZW:
7983
case AArch64::TBNZX:
7984
TargetBBInMI = 2;
7985
IsNegativeBranch = true;
7986
IsTestAndBranch = true;
7987
break;
7988
}
7989
// So we increment a zero register and test for bits other
7990
// than bit 0? Conservatively bail out in case the verifier
7991
// missed this case.
7992
if (IsTestAndBranch && MI.getOperand(1).getImm())
7993
return false;
7994
7995
// Find Definition.
7996
assert(MI.getParent() && "Incomplete machine instruction\n");
7997
MachineBasicBlock *MBB = MI.getParent();
7998
MachineFunction *MF = MBB->getParent();
7999
MachineRegisterInfo *MRI = &MF->getRegInfo();
8000
Register VReg = MI.getOperand(0).getReg();
8001
if (!VReg.isVirtual())
8002
return false;
8003
8004
MachineInstr *DefMI = MRI->getVRegDef(VReg);
8005
8006
// Look through COPY instructions to find definition.
8007
while (DefMI->isCopy()) {
8008
Register CopyVReg = DefMI->getOperand(1).getReg();
8009
if (!MRI->hasOneNonDBGUse(CopyVReg))
8010
return false;
8011
if (!MRI->hasOneDef(CopyVReg))
8012
return false;
8013
DefMI = MRI->getVRegDef(CopyVReg);
8014
}
8015
8016
switch (DefMI->getOpcode()) {
8017
default:
8018
return false;
8019
// Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8020
case AArch64::ANDWri:
8021
case AArch64::ANDXri: {
8022
if (IsTestAndBranch)
8023
return false;
8024
if (DefMI->getParent() != MBB)
8025
return false;
8026
if (!MRI->hasOneNonDBGUse(VReg))
8027
return false;
8028
8029
bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8030
uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8031
DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8032
if (!isPowerOf2_64(Mask))
8033
return false;
8034
8035
MachineOperand &MO = DefMI->getOperand(1);
8036
Register NewReg = MO.getReg();
8037
if (!NewReg.isVirtual())
8038
return false;
8039
8040
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8041
8042
MachineBasicBlock &RefToMBB = *MBB;
8043
MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8044
DebugLoc DL = MI.getDebugLoc();
8045
unsigned Imm = Log2_64(Mask);
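// E.g. Mask == 0x400 gives Imm == 10, producing the "tbnz w8, #10, L1"
// shown in the example in the function comment above.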
8046
unsigned Opc = (Imm < 32)
8047
? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8048
: (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8049
MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8050
.addReg(NewReg)
8051
.addImm(Imm)
8052
.addMBB(TBB);
8053
// Register lives on to the new TB(N)Z now.
8054
MO.setIsKill(false);
8055
8056
// For bit positions smaller than 32, we need to use the 32-bit
8057
// variant (W) in all cases. Indeed the 64-bit variant does not
8058
// allow encoding them.
8059
// Therefore, if the input register is 64-bit, we need to take the
8060
// 32-bit sub-part.
8061
if (!Is32Bit && Imm < 32)
8062
NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8063
MI.eraseFromParent();
8064
return true;
8065
}
8066
// Look for CSINC
8067
case AArch64::CSINCWr:
8068
case AArch64::CSINCXr: {
8069
if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8070
DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8071
!(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8072
DefMI->getOperand(2).getReg() == AArch64::XZR))
8073
return false;
8074
8075
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8076
true) != -1)
8077
return false;
8078
8079
AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8080
// Convert only when the condition code is not modified between
8081
// the CSINC and the branch. The CC may be used by other
8082
// instructions in between.
8083
if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8084
return false;
8085
MachineBasicBlock &RefToMBB = *MBB;
8086
MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8087
DebugLoc DL = MI.getDebugLoc();
8088
if (IsNegativeBranch)
8089
CC = AArch64CC::getInvertedCondCode(CC);
8090
BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8091
MI.eraseFromParent();
8092
return true;
8093
}
8094
}
8095
}
8096
8097
std::pair<unsigned, unsigned>
8098
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8099
const unsigned Mask = AArch64II::MO_FRAGMENT;
8100
return std::make_pair(TF & Mask, TF & ~Mask);
8101
}
8102
8103
ArrayRef<std::pair<unsigned, const char *>>
8104
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8105
using namespace AArch64II;
8106
8107
static const std::pair<unsigned, const char *> TargetFlags[] = {
8108
{MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8109
{MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8110
{MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8111
{MO_HI12, "aarch64-hi12"}};
8112
return ArrayRef(TargetFlags);
8113
}
8114
8115
ArrayRef<std::pair<unsigned, const char *>>
8116
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8117
using namespace AArch64II;
8118
8119
static const std::pair<unsigned, const char *> TargetFlags[] = {
8120
{MO_COFFSTUB, "aarch64-coffstub"},
8121
{MO_GOT, "aarch64-got"},
8122
{MO_NC, "aarch64-nc"},
8123
{MO_S, "aarch64-s"},
8124
{MO_TLS, "aarch64-tls"},
8125
{MO_DLLIMPORT, "aarch64-dllimport"},
8126
{MO_PREL, "aarch64-prel"},
8127
{MO_TAGGED, "aarch64-tagged"},
8128
{MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8129
};
8130
return ArrayRef(TargetFlags);
8131
}
8132
8133
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8134
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8135
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8136
{{MOSuppressPair, "aarch64-suppress-pair"},
8137
{MOStridedAccess, "aarch64-strided-access"}};
8138
return ArrayRef(TargetFlags);
8139
}
8140
8141
/// Constants defining how certain sequences should be outlined.
8142
/// This encompasses how an outlined function should be called, and what kind of
8143
/// frame should be emitted for that outlined function.
8144
///
8145
/// \p MachineOutlinerDefault implies that the function should be called with
8146
/// a save and restore of LR to the stack.
8147
///
8148
/// That is,
8149
///
8150
/// I1 Save LR OUTLINED_FUNCTION:
8151
/// I2 --> BL OUTLINED_FUNCTION I1
8152
/// I3 Restore LR I2
8153
/// I3
8154
/// RET
8155
///
8156
/// * Call construction overhead: 3 (save + BL + restore)
8157
/// * Frame construction overhead: 1 (ret)
8158
/// * Requires stack fixups? Yes
8159
///
8160
/// \p MachineOutlinerTailCall implies that the function is being created from
8161
/// a sequence of instructions ending in a return.
8162
///
8163
/// That is,
8164
///
8165
/// I1 OUTLINED_FUNCTION:
8166
/// I2 --> B OUTLINED_FUNCTION I1
8167
/// RET I2
8168
/// RET
8169
///
8170
/// * Call construction overhead: 1 (B)
8171
/// * Frame construction overhead: 0 (Return included in sequence)
8172
/// * Requires stack fixups? No
8173
///
8174
/// \p MachineOutlinerNoLRSave implies that the function should be called using
8175
/// a BL instruction, but doesn't require LR to be saved and restored. This
8176
/// happens when LR is known to be dead.
8177
///
8178
/// That is,
8179
///
8180
/// I1 OUTLINED_FUNCTION:
8181
/// I2 --> BL OUTLINED_FUNCTION I1
8182
/// I3 I2
8183
/// I3
8184
/// RET
8185
///
8186
/// * Call construction overhead: 1 (BL)
8187
/// * Frame construction overhead: 1 (RET)
8188
/// * Requires stack fixups? No
8189
///
8190
/// \p MachineOutlinerThunk implies that the function is being created from
8191
/// a sequence of instructions ending in a call. The outlined function is
8192
/// called with a BL instruction, and the outlined function tail-calls the
8193
/// original call destination.
8194
///
8195
/// That is,
8196
///
8197
/// I1 OUTLINED_FUNCTION:
8198
/// I2 --> BL OUTLINED_FUNCTION I1
8199
/// BL f I2
8200
/// B f
8201
/// * Call construction overhead: 1 (BL)
8202
/// * Frame construction overhead: 0
8203
/// * Requires stack fixups? No
8204
///
8205
/// \p MachineOutlinerRegSave implies that the function should be called with a
8206
/// save and restore of LR to an available register. This allows us to avoid
8207
/// stack fixups. Note that this outlining variant is compatible with the
8208
/// NoLRSave case.
8209
///
8210
/// That is,
8211
///
8212
/// I1 Save LR OUTLINED_FUNCTION:
8213
/// I2 --> BL OUTLINED_FUNCTION I1
8214
/// I3 Restore LR I2
8215
/// I3
8216
/// RET
8217
///
8218
/// * Call construction overhead: 3 (save + BL + restore)
8219
/// * Frame construction overhead: 1 (ret)
8220
/// * Requires stack fixups? No
8221
enum MachineOutlinerClass {
8222
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8223
MachineOutlinerTailCall, /// Only emit a branch.
8224
MachineOutlinerNoLRSave, /// Emit a call and return.
8225
MachineOutlinerThunk, /// Emit a call and tail-call.
8226
MachineOutlinerRegSave /// Same as default, but save to a register.
8227
};
8228
8229
enum MachineOutlinerMBBFlags {
8230
LRUnavailableSomewhere = 0x2,
8231
HasCalls = 0x4,
8232
UnsafeRegsDead = 0x8
8233
};
8234
8235
Register
8236
AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8237
MachineFunction *MF = C.getMF();
8238
const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8239
const AArch64RegisterInfo *ARI =
8240
static_cast<const AArch64RegisterInfo *>(&TRI);
8241
// Check if there is an available register across the sequence that we can
8242
// use.
8243
for (unsigned Reg : AArch64::GPR64RegClass) {
8244
if (!ARI->isReservedReg(*MF, Reg) &&
8245
Reg != AArch64::LR && // LR is not reserved, but don't use it.
8246
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8247
Reg != AArch64::X17 && // Ditto for X17.
8248
C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8249
C.isAvailableInsideSeq(Reg, TRI))
8250
return Reg;
8251
}
8252
return Register();
8253
}
8254
8255
static bool
8256
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8257
const outliner::Candidate &b) {
8258
const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8259
const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8260
8261
return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8262
MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8263
}
8264
8265
static bool
8266
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8267
const outliner::Candidate &b) {
8268
const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8269
const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8270
8271
return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8272
}
8273
8274
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8275
const outliner::Candidate &b) {
8276
const AArch64Subtarget &SubtargetA =
8277
a.getMF()->getSubtarget<AArch64Subtarget>();
8278
const AArch64Subtarget &SubtargetB =
8279
b.getMF()->getSubtarget<AArch64Subtarget>();
8280
return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8281
}
8282
8283
std::optional<outliner::OutlinedFunction>
8284
AArch64InstrInfo::getOutliningCandidateInfo(
8285
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8286
unsigned SequenceSize = 0;
8287
for (auto &MI : RepeatedSequenceLocs[0])
8288
SequenceSize += getInstSizeInBytes(MI);
8289
8290
unsigned NumBytesToCreateFrame = 0;
8291
8292
// We only allow outlining for functions having exactly matching return
8293
// address signing attributes, i.e., all share the same value for the
8294
// attribute "sign-return-address" and all share the same type of key they
8295
// are signed with.
8296
// Additionally we require all functions to simultaneously either support
8297
// v8.3a features or not. Otherwise an outlined function could get signed
8298
// using dedicated v8.3 instructions and a call from a function that doesn't
8299
// support v8.3 instructions would therefore be invalid.
8300
if (std::adjacent_find(
8301
RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8302
[](const outliner::Candidate &a, const outliner::Candidate &b) {
8303
// Return true if a and b are non-equal w.r.t. return address
8304
// signing or support of v8.3a features
8305
if (outliningCandidatesSigningScopeConsensus(a, b) &&
8306
outliningCandidatesSigningKeyConsensus(a, b) &&
8307
outliningCandidatesV8_3OpsConsensus(a, b)) {
8308
return false;
8309
}
8310
return true;
8311
}) != RepeatedSequenceLocs.end()) {
8312
return std::nullopt;
8313
}
8314
8315
// Since at this point all candidates agree on their return address signing
8316
// picking just one is fine. If the candidate functions potentially sign their
8317
// return addresses, the outlined function should do the same. Note that in
8318
// the case of "sign-return-address"="non-leaf" this is an assumption: It is
8319
// not certainly true that the outlined function will have to sign its return
8320
// address but this decision is made later, when the decision to outline
8321
// has already been made.
8322
// The same holds for the number of additional instructions we need: On
8323
// v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8324
// necessary. However, at this point we don't know if the outlined function
8325
// will have a RET instruction so we assume the worst.
8326
const TargetRegisterInfo &TRI = getRegisterInfo();
8327
// Performing a tail call may require extra checks when PAuth is enabled.
8328
// If PAuth is disabled, set it to zero for uniformity.
8329
unsigned NumBytesToCheckLRInTCEpilogue = 0;
8330
if (RepeatedSequenceLocs[0]
8331
.getMF()
8332
->getInfo<AArch64FunctionInfo>()
8333
->shouldSignReturnAddress(true)) {
8334
// One PAC and one AUT instruction
8335
NumBytesToCreateFrame += 8;
8336
8337
// PAuth is enabled - set extra tail call cost, if any.
8338
auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8339
*RepeatedSequenceLocs[0].getMF());
8340
NumBytesToCheckLRInTCEpilogue =
8341
AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8342
// Checking the authenticated LR value may significantly impact
8343
// SequenceSize, so account for it for more precise results.
8344
if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8345
SequenceSize += NumBytesToCheckLRInTCEpilogue;
8346
8347
// We have to check if sp modifying instructions would get outlined.
8348
// If so we only allow outlining if sp is unchanged overall, so matching
8349
// sub and add instructions are okay to outline, all other sp modifications
8350
// are not.
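// E.g. a matched "sub sp, sp, #16" ... "add sp, sp, #16" pair nets to zero
// and is fine to outline, whereas an unmatched adjustment leaves SPValue
// nonzero and the candidate is rejected.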
8351
auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8352
int SPValue = 0;
8353
for (auto &MI : C) {
8354
if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8355
switch (MI.getOpcode()) {
8356
case AArch64::ADDXri:
8357
case AArch64::ADDWri:
8358
assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8359
assert(MI.getOperand(2).isImm() &&
8360
"Expected operand to be immediate");
8361
assert(MI.getOperand(1).isReg() &&
8362
"Expected operand to be a register");
8363
// Check if the add just increments sp. If so, we search for
8364
// matching sub instructions that decrement sp. If not, the
8365
// modification is illegal
8366
if (MI.getOperand(1).getReg() == AArch64::SP)
8367
SPValue += MI.getOperand(2).getImm();
8368
else
8369
return true;
8370
break;
8371
case AArch64::SUBXri:
8372
case AArch64::SUBWri:
8373
assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8374
assert(MI.getOperand(2).isImm() &&
8375
"Expected operand to be immediate");
8376
assert(MI.getOperand(1).isReg() &&
8377
"Expected operand to be a register");
8378
// Check if the sub just decrements sp. If so, we search for
8379
// matching add instructions that increment sp. If not, the
8380
// modification is illegal
8381
if (MI.getOperand(1).getReg() == AArch64::SP)
8382
SPValue -= MI.getOperand(2).getImm();
8383
else
8384
return true;
8385
break;
8386
default:
8387
return true;
8388
}
8389
}
8390
}
8391
if (SPValue)
8392
return true;
8393
return false;
8394
};
8395
// Remove candidates with illegal stack modifying instructions
8396
llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8397
8398
// If the sequence doesn't have enough candidates left, then we're done.
8399
if (RepeatedSequenceLocs.size() < 2)
8400
return std::nullopt;
8401
}
8402
8403
// Properties about candidate MBBs that hold for all of them.
8404
unsigned FlagsSetInAll = 0xF;
8405
8406
// Compute liveness information for each candidate, and set FlagsSetInAll.
8407
for (outliner::Candidate &C : RepeatedSequenceLocs)
8408
FlagsSetInAll &= C.Flags;
8409
8410
unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8411
8412
// Helper lambda which sets call information for every candidate.
8413
auto SetCandidateCallInfo =
8414
[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8415
for (outliner::Candidate &C : RepeatedSequenceLocs)
8416
C.setCallInfo(CallID, NumBytesForCall);
8417
};
8418
8419
unsigned FrameID = MachineOutlinerDefault;
8420
NumBytesToCreateFrame += 4;
8421
8422
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8423
return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8424
});
8425
8426
// We check to see if CFI Instructions are present, and if they are
8427
// we find the number of CFI Instructions in the candidates.
8428
unsigned CFICount = 0;
8429
for (auto &I : RepeatedSequenceLocs[0]) {
8430
if (I.isCFIInstruction())
8431
CFICount++;
8432
}
8433
8434
// We compare the number of found CFI Instructions to the number of CFI
8435
// instructions in the parent function for each candidate. We must check this
8436
// since if we outline one of the CFI instructions in a function, we have to
8437
// outline them all for correctness. If we do not, the address offsets will be
8438
// incorrect between the two sections of the program.
8439
for (outliner::Candidate &C : RepeatedSequenceLocs) {
8440
std::vector<MCCFIInstruction> CFIInstructions =
8441
C.getMF()->getFrameInstructions();
8442
8443
if (CFICount > 0 && CFICount != CFIInstructions.size())
8444
return std::nullopt;
8445
}
8446
8447
// Returns true if an instruction is safe to fix up, false otherwise.
8448
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8449
if (MI.isCall())
8450
return true;
8451
8452
if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8453
!MI.readsRegister(AArch64::SP, &TRI))
8454
return true;
8455
8456
// Any modification of SP will break our code to save/restore LR.
8457
// FIXME: We could handle some instructions which add a constant
8458
// offset to SP, with a bit more work.
8459
if (MI.modifiesRegister(AArch64::SP, &TRI))
8460
return false;
8461
8462
// At this point, we have a stack instruction that we might need to
8463
// fix up. We'll handle it if it's a load or store.
8464
if (MI.mayLoadOrStore()) {
8465
const MachineOperand *Base; // Filled with the base operand of MI.
8466
int64_t Offset; // Filled with the offset of MI.
8467
bool OffsetIsScalable;
8468
8469
// Does it allow us to offset the base operand and is the base the
8470
// register SP?
8471
if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8472
!Base->isReg() || Base->getReg() != AArch64::SP)
8473
return false;
8474
8475
// Fix-up code below assumes bytes.
8476
if (OffsetIsScalable)
8477
return false;
8478
8479
// Find the minimum/maximum offset for this instruction and check
8480
// if fixing it up would be in range.
8481
int64_t MinOffset,
8482
MaxOffset; // Unscaled offsets for the instruction.
8483
// The scale to multiply the offsets by.
8484
TypeSize Scale(0U, false), DummyWidth(0U, false);
8485
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8486
8487
Offset += 16; // Update the offset to what it would be if we outlined.
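// E.g. "ldr x0, [sp, #8]" in the candidate would become "ldr x0, [sp, #24]"
// once the outlined function has saved LR with "str x30, [sp, #-16]!".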
8488
if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8489
Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8490
return false;
8491
8492
// It's in range, so we can outline it.
8493
return true;
8494
}
8495
8496
// FIXME: Add handling for instructions like "add x0, sp, #8".
8497
8498
// We can't fix it up, so don't outline it.
8499
return false;
8500
};
8501
8502
// True if it's possible to fix up each stack instruction in this sequence.
8503
// Important for frames/call variants that modify the stack.
8504
bool AllStackInstrsSafe =
8505
llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8506
8507
// If the last instruction in any candidate is a terminator, then we should
8508
// tail call all of the candidates.
8509
if (RepeatedSequenceLocs[0].back().isTerminator()) {
8510
FrameID = MachineOutlinerTailCall;
8511
NumBytesToCreateFrame = 0;
8512
unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8513
SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8514
}
8515
8516
else if (LastInstrOpcode == AArch64::BL ||
8517
((LastInstrOpcode == AArch64::BLR ||
8518
LastInstrOpcode == AArch64::BLRNoIP) &&
8519
!HasBTI)) {
8520
// FIXME: Do we need to check if the code after this uses the value of LR?
8521
FrameID = MachineOutlinerThunk;
8522
NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8523
SetCandidateCallInfo(MachineOutlinerThunk, 4);
8524
}
8525
8526
else {
8527
// We need to decide how to emit calls + frames. We can always emit the same
8528
// frame if we don't need to save to the stack. If we have to save to the
8529
// stack, then we need a different frame.
8530
unsigned NumBytesNoStackCalls = 0;
8531
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8532
8533
// Check if we have to save LR.
8534
for (outliner::Candidate &C : RepeatedSequenceLocs) {
8535
bool LRAvailable =
8536
(C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8537
? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8538
: true;
8539
// If we have a noreturn caller, then we're going to be conservative and
8540
// say that we have to save LR. If we don't have a ret at the end of the
8541
// block, then we can't reason about liveness accurately.
8542
//
8543
// FIXME: We can probably do better than always disabling this in
8544
// noreturn functions by fixing up the liveness info.
8545
bool IsNoReturn =
8546
C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8547
8548
// Is LR available? If so, we don't need a save.
8549
if (LRAvailable && !IsNoReturn) {
8550
NumBytesNoStackCalls += 4;
8551
C.setCallInfo(MachineOutlinerNoLRSave, 4);
8552
CandidatesWithoutStackFixups.push_back(C);
8553
}
8554
8555
// Is an unused register available? If so, we won't modify the stack, so
8556
// we can outline with the same frame type as those that don't save LR.
8557
else if (findRegisterToSaveLRTo(C)) {
8558
NumBytesNoStackCalls += 12;
8559
C.setCallInfo(MachineOutlinerRegSave, 12);
8560
CandidatesWithoutStackFixups.push_back(C);
8561
}
8562
8563
// Is SP used in the sequence at all? If not, we don't have to modify
8564
// the stack, so we are guaranteed to get the same frame.
8565
else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8566
NumBytesNoStackCalls += 12;
8567
C.setCallInfo(MachineOutlinerDefault, 12);
8568
CandidatesWithoutStackFixups.push_back(C);
8569
}
8570
8571
// If we outline this, we need to modify the stack. Pretend we don't
8572
// outline this by saving all of its bytes.
8573
else {
8574
NumBytesNoStackCalls += SequenceSize;
8575
}
8576
}
8577
8578
// If there are no places where we have to save LR, then note that we
8579
// don't have to update the stack. Otherwise, give every candidate the
8580
// default call type, as long as it's safe to do so.
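// (12 bytes is the cost of the default call sequence: save LR, BL, and
// restore LR, at 4 bytes each. So the check below keeps the no-stack-fixup
// variants whenever they are collectively no more expensive than giving
// every candidate the default call type.)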
8581
if (!AllStackInstrsSafe ||
8582
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8583
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8584
FrameID = MachineOutlinerNoLRSave;
8585
if (RepeatedSequenceLocs.size() < 2)
8586
return std::nullopt;
8587
} else {
8588
SetCandidateCallInfo(MachineOutlinerDefault, 12);
8589
8590
// Bugzilla ID: 46767
8591
// TODO: Check if fixing up the stack more than once is safe so we can
8592
// outline these.
8593
//
8594
// An outline resulting in a caller that requires stack fixups at the
8595
// callsite to a callee that also requires stack fixups can happen when
8596
// there are no available registers at the candidate callsite for a
8597
// candidate that itself also has calls.
8598
//
8599
// In other words if function_containing_sequence in the following pseudo
8600
// assembly requires that we save LR at the point of the call, but there
8601
// are no available registers: in this case we save using SP and as a
8602
// result the SP offsets require stack fixups by multiples of 16.
8603
//
8604
// function_containing_sequence:
8605
// ...
8606
// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8607
// call OUTLINED_FUNCTION_N
8608
// restore LR from SP
8609
// ...
8610
//
8611
// OUTLINED_FUNCTION_N:
8612
// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8613
// ...
8614
// bl foo
8615
// restore LR from SP
8616
// ret
8617
//
8618
// Because the code to handle more than one stack fixup does not
8619
// currently have the proper checks for legality, these cases will assert
8620
// in the AArch64 MachineOutliner. This is because the code to do this
8621
// needs more hardening, testing, better checks that generated code is
8622
// legal, etc and because it is only verified to handle a single pass of
8623
// stack fixup.
8624
//
8625
// The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8626
// these cases until they are known to be handled. Bugzilla 46767 is
8627
// referenced in comments at the assert site.
8628
//
8629
// To avoid asserting (or generating non-legal code on noassert builds)
8630
// we remove all candidates which would need more than one stack fixup by
8631
// pruning the cases where the candidate has calls while also having no
8632
// available LR and having no available general purpose registers to copy
8633
// LR to (ie one extra stack save/restore).
8634
//
8635
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8636
erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8637
auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8638
return (llvm::any_of(C, IsCall)) &&
8639
(!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8640
!findRegisterToSaveLRTo(C));
8641
});
8642
}
8643
}
8644
8645
// If we dropped all of the candidates, bail out here.
8646
if (RepeatedSequenceLocs.size() < 2) {
8647
RepeatedSequenceLocs.clear();
8648
return std::nullopt;
8649
}
8650
}
8651
8652
// Does every candidate's MBB contain a call? If so, then we might have a call
8653
// in the range.
8654
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8655
// Check if the range contains a call. These require a save + restore of the
8656
// link register.
8657
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8658
bool ModStackToSaveLR = false;
8659
if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8660
[](const MachineInstr &MI) { return MI.isCall(); }))
8661
ModStackToSaveLR = true;
8662
8663
// Handle the last instruction separately. If this is a tail call, then the
8664
// last instruction is a call. We don't want to save + restore in this case.
8665
// However, it could be possible that the last instruction is a call without
8666
// it being valid to tail call this sequence. We should consider this as
8667
// well.
8668
else if (FrameID != MachineOutlinerThunk &&
8669
FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8670
ModStackToSaveLR = true;
8671
8672
if (ModStackToSaveLR) {
8673
// We can't fix up the stack. Bail out.
8674
if (!AllStackInstrsSafe) {
8675
RepeatedSequenceLocs.clear();
8676
return std::nullopt;
8677
}
8678
8679
// Save + restore LR.
8680
NumBytesToCreateFrame += 8;
8681
}
8682
}
8683
8684
// If we have CFI instructions, we can only outline if the outlined section
8685
// can be a tail call
8686
if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8687
return std::nullopt;
8688
8689
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8690
NumBytesToCreateFrame, FrameID);
8691
}
8692
8693
void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8694
Function &F, std::vector<outliner::Candidate> &Candidates) const {
8695
// If a bunch of candidates reach this point they must agree on their return
8696
// address signing. It is therefore enough to just consider the signing
8697
// behaviour of one of them
8698
const auto &CFn = Candidates.front().getMF()->getFunction();
8699
8700
if (CFn.hasFnAttribute("ptrauth-returns"))
8701
F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
8702
if (CFn.hasFnAttribute("ptrauth-auth-traps"))
8703
F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
8704
// Since all candidates belong to the same module, just copy the
8705
// function-level attributes of an arbitrary function.
8706
if (CFn.hasFnAttribute("sign-return-address"))
8707
F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8708
if (CFn.hasFnAttribute("sign-return-address-key"))
8709
F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8710
8711
AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8712
}
8713
8714
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8715
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8716
const Function &F = MF.getFunction();
8717
8718
// Can F be deduplicated by the linker? If it can, don't outline from it.
8719
if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8720
return false;
8721
8722
// Don't outline from functions with section markings; the program could
8723
// expect that all the code is in the named section.
8724
// FIXME: Allow outlining from multiple functions with the same section
8725
// marking.
8726
if (F.hasSection())
8727
return false;
8728
8729
// Outlining from functions with redzones is unsafe since the outliner may
8730
// modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8731
// outline from it.
8732
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8733
if (!AFI || AFI->hasRedZone().value_or(true))
8734
return false;
8735
8736
// FIXME: Determine whether it is safe to outline from functions which contain
8737
// streaming-mode changes. We may need to ensure any smstart/smstop pairs are
8738
// outlined together and ensure it is safe to outline with async unwind info,
8739
// required for saving & restoring VG around calls.
8740
if (AFI->hasStreamingModeChanges())
8741
return false;
8742
8743
// FIXME: Teach the outliner to generate/handle Windows unwind info.
8744
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8745
return false;
8746
8747
// It's safe to outline from MF.
8748
return true;
8749
}
8750
8751
SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8752
AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8753
unsigned &Flags) const {
8754
assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8755
"Must track liveness!");
8756
SmallVector<
8757
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8758
Ranges;
8759
// According to the AArch64 Procedure Call Standard, the following are
8760
// undefined on entry/exit from a function call:
8761
//
8762
// * Registers x16, x17, (and thus w16, w17)
8763
// * Condition codes (and thus the NZCV register)
8764
//
8765
// If any of these registers are used inside or live across an outlined
8766
// function, then they may be modified later, either by the compiler or
8767
// some other tool (like the linker).
8768
//
8769
// To avoid outlining in these situations, partition each block into ranges
8770
// where these registers are dead. We will only outline from those ranges.
8771
LiveRegUnits LRU(getRegisterInfo());
8772
auto AreAllUnsafeRegsDead = [&LRU]() {
8773
return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8774
LRU.available(AArch64::NZCV);
8775
};
8776
8777
// We need to know if LR is live across an outlining boundary later on in
8778
// order to decide how we'll create the outlined call, frame, etc.
8779
//
8780
// It's pretty expensive to check this for *every candidate* within a block.
8781
// That's some potentially n^2 behaviour, since in the worst case, we'd need
8782
// to compute liveness from the end of the block for O(n) candidates within
8783
// the block.
8784
//
8785
// So, to improve the average case, let's keep track of liveness from the end
8786
// of the block to the beginning of *every outlinable range*. If we know that
8787
// LR is available in every range we could outline from, then we know that
8788
// we don't need to check liveness for any candidate within that range.
8789
bool LRAvailableEverywhere = true;
8790
// Compute liveness bottom-up.
8791
LRU.addLiveOuts(MBB);
8792
// Update flags that require info about the entire MBB.
8793
auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8794
if (MI.isCall() && !MI.isTerminator())
8795
Flags |= MachineOutlinerMBBFlags::HasCalls;
8796
};
8797
// Range: [RangeBegin, RangeEnd)
8798
MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8799
unsigned RangeLen;
8800
auto CreateNewRangeStartingAt =
8801
[&RangeBegin, &RangeEnd,
8802
&RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8803
RangeBegin = NewBegin;
8804
RangeEnd = std::next(RangeBegin);
8805
RangeLen = 0;
8806
};
8807
auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8808
// At least one unsafe register is not dead. We do not want to outline at
8809
// this point. If it is long enough to outline from, save the range
8810
// [RangeBegin, RangeEnd).
8811
if (RangeLen > 1)
8812
Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8813
};
8814
// Find the first point where all unsafe registers are dead.
8815
// FIND: <safe instr> <-- end of first potential range
8816
// SKIP: <unsafe def>
8817
// SKIP: ... everything between ...
8818
// SKIP: <unsafe use>
8819
auto FirstPossibleEndPt = MBB.instr_rbegin();
8820
for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8821
LRU.stepBackward(*FirstPossibleEndPt);
8822
// Update flags that impact how we outline across the entire block,
8823
// regardless of safety.
8824
UpdateWholeMBBFlags(*FirstPossibleEndPt);
8825
if (AreAllUnsafeRegsDead())
8826
break;
8827
}
8828
// If we exhausted the entire block, we have no safe ranges to outline.
8829
if (FirstPossibleEndPt == MBB.instr_rend())
8830
return Ranges;
8831
// Current range.
8832
CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8833
// StartPt points to the first place where all unsafe registers
8834
// are dead (if there is any such point). Begin partitioning the MBB into
8835
// ranges.
8836
for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8837
LRU.stepBackward(MI);
8838
UpdateWholeMBBFlags(MI);
8839
if (!AreAllUnsafeRegsDead()) {
8840
SaveRangeIfNonEmpty();
8841
CreateNewRangeStartingAt(MI.getIterator());
8842
continue;
8843
}
8844
LRAvailableEverywhere &= LRU.available(AArch64::LR);
8845
RangeBegin = MI.getIterator();
8846
++RangeLen;
8847
}
8848
// Above loop misses the last (or only) range. If we are still safe, then
8849
// let's save the range.
8850
if (AreAllUnsafeRegsDead())
8851
SaveRangeIfNonEmpty();
8852
if (Ranges.empty())
8853
return Ranges;
8854
// We found the ranges bottom-up. Mapping expects them top-down. Reverse
8855
// the order.
8856
std::reverse(Ranges.begin(), Ranges.end());
8857
// If there is at least one outlinable range where LR is unavailable
8858
// somewhere, remember that.
8859
if (!LRAvailableEverywhere)
8860
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8861
return Ranges;
8862
}
8863
8864
outliner::InstrType
8865
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8866
unsigned Flags) const {
8867
MachineInstr &MI = *MIT;
8868
MachineBasicBlock *MBB = MI.getParent();
8869
MachineFunction *MF = MBB->getParent();
8870
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8871
8872
// Don't outline anything used for return address signing. The outlined
8873
// function will get signed later if needed
8874
switch (MI.getOpcode()) {
8875
case AArch64::PACM:
8876
case AArch64::PACIASP:
8877
case AArch64::PACIBSP:
8878
case AArch64::PACIASPPC:
8879
case AArch64::PACIBSPPC:
8880
case AArch64::AUTIASP:
8881
case AArch64::AUTIBSP:
8882
case AArch64::AUTIASPPCi:
8883
case AArch64::AUTIASPPCr:
8884
case AArch64::AUTIBSPPCi:
8885
case AArch64::AUTIBSPPCr:
8886
case AArch64::RETAA:
8887
case AArch64::RETAB:
8888
case AArch64::RETAASPPCi:
8889
case AArch64::RETAASPPCr:
8890
case AArch64::RETABSPPCi:
8891
case AArch64::RETABSPPCr:
8892
case AArch64::EMITBKEY:
8893
case AArch64::PAUTH_PROLOGUE:
8894
case AArch64::PAUTH_EPILOGUE:
8895
return outliner::InstrType::Illegal;
8896
}
8897
8898
// Don't outline LOHs.
8899
if (FuncInfo->getLOHRelated().count(&MI))
8900
return outliner::InstrType::Illegal;
8901
8902
// We can only outline these if we will tail call the outlined function, or
8903
// fix up the CFI offsets. Currently, CFI instructions are outlined only if
8904
// in a tail call.
8905
//
8906
// FIXME: If the proper fixups for the offset are implemented, this should be
8907
// possible.
8908
if (MI.isCFIInstruction())
8909
return outliner::InstrType::Legal;
8910
8911
// Is this a terminator for a basic block?
8912
if (MI.isTerminator())
8913
// TargetInstrInfo::getOutliningType has already filtered out anything
8914
// that would break this, so we can allow it here.
8915
return outliner::InstrType::Legal;
8916
8917
// Make sure none of the operands are un-outlinable.
8918
for (const MachineOperand &MOP : MI.operands()) {
8919
// A check preventing CFI indices was here before, but only CFI
8920
// instructions should have those.
8921
assert(!MOP.isCFIIndex());
8922
8923
// If it uses LR or W30 explicitly, then don't touch it.
8924
if (MOP.isReg() && !MOP.isImplicit() &&
8925
(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8926
return outliner::InstrType::Illegal;
8927
}
8928
8929
// Special cases for instructions that can always be outlined, but will fail
8930
// the later tests. E.g., ADRPs, which are PC-relative, use LR, but can always
8931
// be outlined because they don't require a *specific* value to be in LR.
8932
if (MI.getOpcode() == AArch64::ADRP)
8933
return outliner::InstrType::Legal;
8934
8935
// If MI is a call we might be able to outline it. We don't want to outline
8936
// any calls that rely on the position of items on the stack. When we outline
8937
// something containing a call, we have to emit a save and restore of LR in
8938
// the outlined function. Currently, this always happens by saving LR to the
8939
// stack. Thus, if we outline, say, half the parameters for a function call
8940
// plus the call, then we'll break the callee's expectations for the layout
8941
// of the stack.
8942
//
8943
// FIXME: Allow calls to functions which construct a stack frame, as long
8944
// as they don't access arguments on the stack.
8945
// FIXME: Figure out some way to analyze functions defined in other modules.
8946
// We should be able to compute the memory usage based on the IR calling
8947
// convention, even if we can't see the definition.
8948
if (MI.isCall()) {
8949
// Get the function associated with the call. Look at each operand and find
8950
// the one that represents the callee and get its name.
8951
const Function *Callee = nullptr;
8952
for (const MachineOperand &MOP : MI.operands()) {
8953
if (MOP.isGlobal()) {
8954
Callee = dyn_cast<Function>(MOP.getGlobal());
8955
break;
8956
}
8957
}
8958
8959
// Never outline calls to mcount. There isn't any rule that would require
8960
// this, but the Linux kernel's "ftrace" feature depends on it.
8961
if (Callee && Callee->getName() == "\01_mcount")
8962
return outliner::InstrType::Illegal;
8963
8964
// If we don't know anything about the callee, assume it depends on the
8965
// stack layout of the caller. In that case, it's only legal to outline
8966
// as a tail-call. Explicitly list the call instructions we know about so we
8967
// don't get unexpected results with call pseudo-instructions.
8968
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8969
if (MI.getOpcode() == AArch64::BLR ||
8970
MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8971
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8972
8973
if (!Callee)
8974
return UnknownCallOutlineType;
8975
8976
// We have a function we have information about. Check if it's something we
8977
// can safely outline.
8978
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8979
8980
// We don't know what's going on with the callee at all. Don't touch it.
8981
if (!CalleeMF)
8982
return UnknownCallOutlineType;
8983
8984
// Check if we know anything about the callee saves on the function. If we
8985
// don't, then don't touch it, since that implies that we haven't
8986
// computed anything about its stack frame yet.
8987
MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8988
if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8989
MFI.getNumObjects() > 0)
8990
return UnknownCallOutlineType;
8991
8992
// At this point, we can say that CalleeMF ought to not pass anything on the
8993
// stack. Therefore, we can outline it.
8994
return outliner::InstrType::Legal;
8995
}
8996
8997
// Don't touch the link register or W30.
8998
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8999
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9000
return outliner::InstrType::Illegal;
9001
9002
// Don't outline BTI instructions, because that will prevent the outlining
9003
// site from being indirectly callable.
9004
if (hasBTISemantics(MI))
9005
return outliner::InstrType::Illegal;
9006
9007
return outliner::InstrType::Legal;
9008
}
9009
9010
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9011
for (MachineInstr &MI : MBB) {
9012
const MachineOperand *Base;
9013
TypeSize Width(0, false);
9014
int64_t Offset;
9015
bool OffsetIsScalable;
9016
9017
// Is this a load or store with an immediate offset with SP as the base?
9018
if (!MI.mayLoadOrStore() ||
9019
!getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9020
&RI) ||
9021
(Base->isReg() && Base->getReg() != AArch64::SP))
9022
continue;
9023
9024
// It is, so we have to fix it up.
9025
TypeSize Scale(0U, false);
9026
int64_t Dummy1, Dummy2;
9027
9028
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9029
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9030
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9031
assert(Scale != 0 && "Unexpected opcode!");
9032
assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9033
9034
// We've pushed the return address to the stack, so add 16 to the offset.
9035
// This is safe, since we already checked if it would overflow when we
9036
// checked if this instruction was legal to outline.
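// E.g. "ldr x0, [sp, #8]" (byte offset 8, scale 8, encoded immediate 1)
// becomes "ldr x0, [sp, #24]" with encoded immediate 3.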
9037
int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9038
StackOffsetOperand.setImm(NewImm);
9039
}
9040
}
9041
9042
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9043
const AArch64InstrInfo *TII,
9044
bool ShouldSignReturnAddr) {
9045
if (!ShouldSignReturnAddr)
9046
return;
9047
9048
BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9049
.setMIFlag(MachineInstr::FrameSetup);
9050
BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9051
TII->get(AArch64::PAUTH_EPILOGUE))
9052
.setMIFlag(MachineInstr::FrameDestroy);
9053
}
9054
9055
void AArch64InstrInfo::buildOutlinedFrame(
9056
MachineBasicBlock &MBB, MachineFunction &MF,
9057
const outliner::OutlinedFunction &OF) const {
9058
9059
AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9060
9061
if (OF.FrameConstructionID == MachineOutlinerTailCall)
9062
FI->setOutliningStyle("Tail Call");
9063
else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9064
// For thunk outlining, rewrite the last instruction from a call to a
9065
// tail-call.
9066
MachineInstr *Call = &*--MBB.instr_end();
9067
unsigned TailOpcode;
9068
if (Call->getOpcode() == AArch64::BL) {
9069
TailOpcode = AArch64::TCRETURNdi;
9070
} else {
9071
assert(Call->getOpcode() == AArch64::BLR ||
9072
Call->getOpcode() == AArch64::BLRNoIP);
9073
TailOpcode = AArch64::TCRETURNriALL;
9074
}
9075
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9076
.add(Call->getOperand(0))
9077
.addImm(0);
9078
MBB.insert(MBB.end(), TC);
9079
Call->eraseFromParent();
9080
9081
FI->setOutliningStyle("Thunk");
9082
}
9083
9084
bool IsLeafFunction = true;
9085
9086
// Is there a call in the outlined range?
9087
auto IsNonTailCall = [](const MachineInstr &MI) {
9088
return MI.isCall() && !MI.isReturn();
9089
};
9090
9091
if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9092
// Fix up the instructions in the range, since we're going to modify the
9093
// stack.
9094
9095
// Bugzilla ID: 46767
9096
// TODO: Check if fixing up twice is safe so we can outline these.
9097
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9098
"Can only fix up stack references once");
9099
fixupPostOutline(MBB);
9100
9101
IsLeafFunction = false;
9102
9103
// LR has to be a live in so that we can save it.
9104
if (!MBB.isLiveIn(AArch64::LR))
9105
MBB.addLiveIn(AArch64::LR);
9106
9107
MachineBasicBlock::iterator It = MBB.begin();
9108
MachineBasicBlock::iterator Et = MBB.end();
9109
9110
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9111
OF.FrameConstructionID == MachineOutlinerThunk)
9112
Et = std::prev(MBB.end());
9113
9114
// Insert a save before the outlined region
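// (i.e. "str x30, [sp, #-16]!", which also moves SP down by 16 bytes)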
9115
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9116
.addReg(AArch64::SP, RegState::Define)
9117
.addReg(AArch64::LR)
9118
.addReg(AArch64::SP)
9119
.addImm(-16);
9120
It = MBB.insert(It, STRXpre);
9121
9122
if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9123
const TargetSubtargetInfo &STI = MF.getSubtarget();
9124
const MCRegisterInfo *MRI = STI.getRegisterInfo();
9125
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9126
9127
// Add a CFI saying the stack was moved 16 B down.
9128
int64_t StackPosEntry =
9129
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9130
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9131
.addCFIIndex(StackPosEntry)
9132
.setMIFlags(MachineInstr::FrameSetup);
9133
9134
// Add a CFI saying that the LR that we want to find is now 16 B higher
9135
// than before.
9136
int64_t LRPosEntry = MF.addFrameInst(
9137
MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9138
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9139
.addCFIIndex(LRPosEntry)
9140
.setMIFlags(MachineInstr::FrameSetup);
9141
}
9142
9143
// Insert a restore before the terminator for the function.
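// (i.e. "ldr x30, [sp], #16", undoing the save above)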
9144
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9145
.addReg(AArch64::SP, RegState::Define)
9146
.addReg(AArch64::LR, RegState::Define)
9147
.addReg(AArch64::SP)
9148
.addImm(16);
9149
Et = MBB.insert(Et, LDRXpost);
9150
}
9151
9152
bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9153
9154
// If this is a tail call outlined function, then there's already a return.
9155
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9156
OF.FrameConstructionID == MachineOutlinerThunk) {
9157
signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9158
return;
9159
}
9160
9161
// It's not a tail call, so we have to insert the return ourselves.
9162
9163
// LR has to be a live in so that we can return to it.
9164
if (!MBB.isLiveIn(AArch64::LR))
9165
MBB.addLiveIn(AArch64::LR);
9166
9167
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9168
.addReg(AArch64::LR);
9169
MBB.insert(MBB.end(), ret);
9170
9171
signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9172
9173
FI->setOutliningStyle("Function");
9174
9175
// Did we have to modify the stack by saving the link register?
9176
if (OF.FrameConstructionID != MachineOutlinerDefault)
9177
return;
9178
9179
// We modified the stack.
9180
// Walk over the basic block and fix up all the stack accesses.
9181
fixupPostOutline(MBB);
9182
}
9183
9184
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9185
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9186
MachineFunction &MF, outliner::Candidate &C) const {
9187
9188
// Are we tail calling?
9189
if (C.CallConstructionID == MachineOutlinerTailCall) {
9190
// If yes, then we can just branch to the label.
9191
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9192
.addGlobalAddress(M.getNamedValue(MF.getName()))
9193
.addImm(0));
9194
return It;
9195
}
9196
9197
// Are we saving the link register?
9198
if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9199
C.CallConstructionID == MachineOutlinerThunk) {
9200
// No, so just insert the call.
9201
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9202
.addGlobalAddress(M.getNamedValue(MF.getName())));
9203
return It;
9204
}
9205
9206
// We want to return the spot where we inserted the call.
9207
MachineBasicBlock::iterator CallPt;
9208
9209
// Instructions for saving and restoring LR around the call instruction we're
9210
// going to insert.
9211
MachineInstr *Save;
9212
MachineInstr *Restore;
9213
// Can we save to a register?
9214
if (C.CallConstructionID == MachineOutlinerRegSave) {
9215
// FIXME: This logic should be sunk into a target-specific interface so that
9216
// we don't have to recompute the register.
9217
Register Reg = findRegisterToSaveLRTo(C);
9218
assert(Reg && "No callee-saved register available?");
9219
9220
// LR has to be a live in so that we can save it.
9221
if (!MBB.isLiveIn(AArch64::LR))
9222
MBB.addLiveIn(AArch64::LR);
9223
9224
// Save and restore LR from Reg.
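// The ORRXrs forms below are the standard "mov <Reg>, x30" and
// "mov x30, <Reg>" aliases.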
9225
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9226
.addReg(AArch64::XZR)
9227
.addReg(AArch64::LR)
9228
.addImm(0);
9229
Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9230
.addReg(AArch64::XZR)
9231
.addReg(Reg)
9232
.addImm(0);
9233
} else {
9234
// We have the default case. Save and restore from SP.
9235
Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9236
.addReg(AArch64::SP, RegState::Define)
9237
.addReg(AArch64::LR)
9238
.addReg(AArch64::SP)
9239
.addImm(-16);
9240
Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9241
.addReg(AArch64::SP, RegState::Define)
9242
.addReg(AArch64::LR, RegState::Define)
9243
.addReg(AArch64::SP)
9244
.addImm(16);
9245
}
9246
9247
It = MBB.insert(It, Save);
9248
It++;
9249
9250
// Insert the call.
9251
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9252
.addGlobalAddress(M.getNamedValue(MF.getName())));
9253
CallPt = It;
9254
It++;
9255
9256
It = MBB.insert(It, Restore);
9257
return CallPt;
9258
}
9259
9260
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9261
MachineFunction &MF) const {
9262
return MF.getFunction().hasMinSize();
9263
}
9264
9265
void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9266
MachineBasicBlock::iterator Iter,
9267
DebugLoc &DL,
9268
bool AllowSideEffects) const {
9269
const MachineFunction &MF = *MBB.getParent();
9270
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9271
const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9272
9273
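// The three branches below zero the register with, respectively, a MOVZ of
// #0 (general-purpose registers), an SVE DUP of #0 (when SVE is available),
// and a NEON MOVI of #0.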
if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9274
BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9275
} else if (STI.hasSVE()) {
9276
BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9277
.addImm(0)
9278
.addImm(0);
9279
} else {
9280
BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9281
.addImm(0);
9282
}
9283
}
9284
9285
std::optional<DestSourcePair>
9286
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9287
9288
// AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
9289
// and a zero immediate operand are used as an alias for the mov instruction.
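// E.g. "orr w0, wzr, w1" is "mov w0, w1" and "orr x0, xzr, x1" is
// "mov x0, x1".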
9290
if (MI.getOpcode() == AArch64::ORRWrs &&
9291
MI.getOperand(1).getReg() == AArch64::WZR &&
9292
MI.getOperand(3).getImm() == 0x0 &&
9293
// Check that the w->w move is not a zero-extending w->x mov.
9294
(!MI.getOperand(0).getReg().isVirtual() ||
9295
MI.getOperand(0).getSubReg() == 0) &&
9296
(!MI.getOperand(0).getReg().isPhysical() ||
9297
MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9298
AArch64::X0,
9299
/*TRI=*/nullptr) == -1))
9300
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9301
9302
if (MI.getOpcode() == AArch64::ORRXrs &&
9303
MI.getOperand(1).getReg() == AArch64::XZR &&
9304
MI.getOperand(3).getImm() == 0x0)
9305
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9306
9307
return std::nullopt;
9308
}
9309
9310
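// Treat a WZR-based ORRWrs as copy-like even when it also zero-extends into
// the full X register (the case isCopyInstrImpl above deliberately excludes):
// the low 32 bits are still a plain copy of the source.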
std::optional<DestSourcePair>
9311
AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9312
if (MI.getOpcode() == AArch64::ORRWrs &&
9313
MI.getOperand(1).getReg() == AArch64::WZR &&
9314
MI.getOperand(3).getImm() == 0x0)
9315
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9316
return std::nullopt;
9317
}
9318
9319
std::optional<RegImmPair>
9320
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9321
int Sign = 1;
9322
int64_t Offset = 0;
9323
9324
// TODO: Handle cases where Reg is a super- or sub-register of the
9325
// destination register.
9326
const MachineOperand &Op0 = MI.getOperand(0);
9327
if (!Op0.isReg() || Reg != Op0.getReg())
9328
return std::nullopt;
9329
9330
switch (MI.getOpcode()) {
9331
default:
9332
return std::nullopt;
9333
case AArch64::SUBWri:
9334
case AArch64::SUBXri:
9335
case AArch64::SUBSWri:
9336
case AArch64::SUBSXri:
9337
Sign *= -1;
9338
[[fallthrough]];
9339
case AArch64::ADDSWri:
9340
case AArch64::ADDSXri:
9341
case AArch64::ADDWri:
9342
case AArch64::ADDXri: {
9343
// TODO: Third operand can be global address (usually some string).
9344
if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9345
!MI.getOperand(2).isImm())
9346
return std::nullopt;
9347
int Shift = MI.getOperand(3).getImm();
9348
assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
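// E.g. "add x0, x1, #4, lsl #12" yields an offset of 16384; the SUB forms
// negate it via Sign.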
9349
Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9350
}
9351
}
9352
return RegImmPair{MI.getOperand(1).getReg(), Offset};
9353
}
9354
9355
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9356
/// the destination register then, if possible, describe the value in terms of
9357
/// the source register.
9358
static std::optional<ParamLoadedValue>
9359
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9360
const TargetInstrInfo *TII,
9361
const TargetRegisterInfo *TRI) {
9362
auto DestSrc = TII->isCopyLikeInstr(MI);
9363
if (!DestSrc)
9364
return std::nullopt;
9365
9366
Register DestReg = DestSrc->Destination->getReg();
9367
Register SrcReg = DestSrc->Source->getReg();
9368
9369
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9370
9371
// If the described register is the destination, just return the source.
9372
if (DestReg == DescribedReg)
9373
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9374
9375
// ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9376
if (MI.getOpcode() == AArch64::ORRWrs &&
9377
TRI->isSuperRegister(DestReg, DescribedReg))
9378
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9379
9380
// We may need to describe the lower part of a ORRXrs move.
9381
if (MI.getOpcode() == AArch64::ORRXrs &&
9382
TRI->isSubRegister(DestReg, DescribedReg)) {
9383
Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9384
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9385
}
9386
9387
assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9388
"Unhandled ORR[XW]rs copy case");
9389
9390
return std::nullopt;
9391
}
9392
9393
bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9394
// Functions cannot be split to different sections on AArch64 if they have
9395
// a red zone. This is because relaxing a cross-section branch may require
9396
// incrementing the stack pointer to spill a register, which would overwrite
9397
// the red zone.
9398
if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9399
return false;
9400
9401
return TargetInstrInfo::isFunctionSafeToSplit(MF);
9402
}
9403
9404
bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9405
const MachineBasicBlock &MBB) const {
9406
// Asm Goto blocks can contain conditional branches to goto labels, which can
9407
// get moved out of range of the branch instruction.
9408
auto isAsmGoto = [](const MachineInstr &MI) {
9409
return MI.getOpcode() == AArch64::INLINEASM_BR;
9410
};
9411
if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9412
return false;
9413
9414
// Because jump tables are label-relative instead of table-relative, they all
9415
// must be in the same section or relocation fixup handling will fail.
9416
9417
// Check if MBB is a jump table target
9418
const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9419
auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9420
return llvm::is_contained(JTE.MBBs, &MBB);
9421
};
9422
if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9423
return false;
9424
9425
// Check if MBB contains a jump table lookup
9426
for (const MachineInstr &MI : MBB) {
9427
switch (MI.getOpcode()) {
9428
case TargetOpcode::G_BRJT:
9429
case AArch64::JumpTableDest32:
9430
case AArch64::JumpTableDest16:
9431
case AArch64::JumpTableDest8:
9432
return false;
9433
default:
9434
continue;
9435
}
9436
}
9437
9438
// MBB isn't a special case, so it's safe to be split to the cold section.
9439
return true;
9440
}
9441
9442
std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
  return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}

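// Return true if a NumBytes-sized access can fold the given immediate Offset
// or register Scale into an AArch64 addressing mode: a 9-bit signed or scaled
// 12-bit unsigned immediate offset, or a register index that is either
// unscaled or scaled by the access size.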
bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

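// Return the opcode to use for an indirect call: BLRNoIP when SLS hardening
// of BLR calls is enabled, plain BLR otherwise.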
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

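// Emit a stack-probing loop at MBBI: repeatedly decrement SP by the function's
// stack probe size and store to [SP] while SP is still above TargetReg; on
// exit, set SP = TargetReg and probe the final location with a load. Returns
// an iterator to the first instruction after the emitted sequence.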
MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;

  // LoopTest:
  // SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  // CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(Flags);

  // B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::LE)
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  // STR XZR, [SP]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  // B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  // MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins.
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}

namespace {
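// Pipeliner loop info for single-basic-block loops whose trip count is
// controlled by a compare instruction (Comp) and a counter update instruction
// (Update). Used by the MachinePipeliner to build the trip-count guard
// conditions for the prologue/epilogue of the software-pipelined loop.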
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MachineRegisterInfo &MRI;

  /// The block of the loop
  MachineBasicBlock *LoopBB;
  /// The conditional branch of the loop
  MachineInstr *CondBranch;
  /// The compare instruction for loop control
  MachineInstr *Comp;
  /// The number of the operand of the loop counter value in Comp
  unsigned CompCounterOprNum;
  /// The instruction that updates the loop counter value
  MachineInstr *Update;
  /// The number of the operand of the loop counter value in Update
  unsigned UpdateCounterOprNum;
  /// The initial value of the loop counter
  Register Init;
  /// True iff Update is a predecessor of Comp
  bool IsUpdatePriorComp;

  /// The normalized condition used by createTripCountGreaterCondition()
  SmallVector<MachineOperand, 4> Cond;

public:
  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
                           MachineInstr *Comp, unsigned CompCounterOprNum,
                           MachineInstr *Update, unsigned UpdateCounterOprNum,
                           Register Init, bool IsUpdatePriorComp,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : MF(Comp->getParent()->getParent()),
        TII(MF->getSubtarget().getInstrInfo()),
        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
        CompCounterOprNum(CompCounterOprNum), Update(Update),
        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make the instructions for loop control be placed in stage 0.
    // The predecessors of Comp are considered by the caller.
    return MI == Comp;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been inserted.
    CondParam = Cond;
    return {};
  }

  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;

  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  void adjustTripCount(int TripCountAdjust) override {}

  void disposed() override {}
  bool isMVEExpanderSupported() override { return true; }
};
} // namespace

/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
/// is replaced by ReplaceReg. The output register is newly created.
/// The other operands are unchanged from MI.
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
                           Register ReplaceReg, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertTo) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      MBB.getParent()->getSubtarget().getRegisterInfo();
  MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
  Register Result = 0;
  for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
    if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
      Result = MRI.createVirtualRegister(
          MRI.getRegClass(NewMI->getOperand(0).getReg()));
      NewMI->getOperand(I).setReg(Result);
    } else if (I == ReplaceOprNum) {
      MRI.constrainRegClass(
          ReplaceReg,
          TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
      NewMI->getOperand(I).setReg(ReplaceReg);
    }
  }
  MBB.insert(InsertTo, NewMI);
  return Result;
}

void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
    int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
    DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
  // Create and accumulate conditions for next TC iterations.
  // Example:
  //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
  //                                          # iteration of the kernel
  //
  //   # insert the following instructions
  //   cond = CSINCXr 0, 0, C, implicit $nzcv
  //   counter = ADDXri counter, 1            # clone from this->Update
  //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
  //   cond = CSINCXr cond, cond, C, implicit $nzcv
  //   ... (repeat TC times)
  //   SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
  AArch64CC::CondCode CC =
      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
  if (CondBranch->getOperand(1).getMBB() == LoopBB)
    CC = AArch64CC::getInvertedCondCode(CC);

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
  auto AccumulateCond = [&](Register CurCond,
                            AArch64CC::CondCode CC) -> Register {
    Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
        .addReg(NewCond, RegState::Define)
        .addReg(CurCond)
        .addReg(CurCond)
        .addImm(AArch64CC::getInvertedCondCode(CC));
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I==0 already exist in MBB
    // (MBB is an unrolled kernel)
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // can use already calculated value
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // use initial counter value (testing if the trip count is sufficient to
      // be executed by pipelined code)
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
    }

    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      NextCounter =
          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
      .addReg(AccCond)
      .addImm(0)
      .addImm(0);
  Cond.clear();
  Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
}

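// Given a PHI with exactly two incoming values, set RegMBB to the value
// incoming from MBB and RegOther to the value incoming from the other
// predecessor.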
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
                          Register &RegMBB, Register &RegOther) {
  assert(Phi.getNumOperands() == 5);
  if (Phi.getOperand(2).getMBB() == MBB) {
    RegMBB = Phi.getOperand(1).getReg();
    RegOther = Phi.getOperand(3).getReg();
  } else {
    assert(Phi.getOperand(4).getMBB() == MBB);
    RegMBB = Phi.getOperand(3).getReg();
    RegOther = Phi.getOperand(1).getReg();
  }
}

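// Return true if Reg is a virtual register whose defining instruction lives
// outside of BB.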
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  return MRI.getVRegDef(Reg)->getParent() != BB;
}

/// If Reg is an induction variable, return true and set some parameters
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0 ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  Register CurReg = Reg;
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      if (InitReg != 0)
        return false;
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
    } else {
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        UpdateInst = Def;
        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
    }

    if (!CurReg.isVirtual())
      return false;
    if (Reg == CurReg)
      break;
  }

  if (!UpdateInst)
    return false;

  return true;
}

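// MachinePipeliner hook: inspect a single-basic-block loop and, if its branch,
// compare and induction-variable update match the supported forms listed
// below, return a PipelinerLoopInfo describing them.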
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  // Accept loops that meet the following conditions
  // * The conditional branch is BCC
  // * The compare instruction is ADDS/SUBS/WHILEXX
  // * One operand of the compare is an induction variable and the other is a
  //   loop invariant value
  // * The induction variable is incremented/decremented by a single instruction
  // * Does not contain CALL or instructions which have unmodeled side effects

  for (MachineInstr &MI : *LoopBB)
    if (MI.isCall() || MI.hasUnmodeledSideEffects())
      // This instruction may use NZCV, which interferes with the instruction to
      // be inserted for loop control.
      return nullptr;

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be conditional branch
  if (TBB != LoopBB && FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (CondBranch->getOpcode() != AArch64::Bcc)
    return nullptr;

  // Normalization for createTripCountGreaterCondition()
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *Comp = nullptr;
  unsigned CompCounterOprNum = 0;
  for (MachineInstr &MI : reverse(*LoopBB)) {
    if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
      // operands is a loop invariant value

      switch (MI.getOpcode()) {
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
        Comp = &MI;
        CompCounterOprNum = 1;
        break;
      case AArch64::ADDSWrr:
      case AArch64::ADDSXrr:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXrr:
        Comp = &MI;
        break;
      default:
        if (isWhileOpcode(MI.getOpcode())) {
          Comp = &MI;
          break;
        }
        return nullptr;
      }

      if (CompCounterOprNum == 0) {
        if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
          CompCounterOprNum = 2;
        else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
          CompCounterOprNum = 1;
        else
          return nullptr;
      }
      break;
    }
  }
  if (!Comp)
    return nullptr;

  MachineInstr *Update = nullptr;
  Register Init;
  bool IsUpdatePriorComp;
  unsigned UpdateCounterOprNum;
  if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
                     Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
    return nullptr;

  return std::make_unique<AArch64PipelinerLoopInfo>(
      LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
      Init, IsUpdatePriorComp, Cond);
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
