GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In this case, we can try to split the constant operand of the mov
//    instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. That gives two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
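//
//    For example (an illustrative case, not taken from actual compiler
//    output), a constant such as 0x123456 needs a MOVZ + MOVK pair, so
//    transformation 2 could instead produce:
//      %tmp:gpr32 = ADDWri %src, 0x123, 12   ; adds 0x123000
//      %dst:gpr32 = ADDWri %tmp, 0x456, 0    ; adds 0x456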
//
// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If AArch64's 32-bit form of instruction defines the source operand of
//    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
//    operand are set to zero.
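//
//    For example (illustrative only), when %2 is already produced by a 32-bit
//    instruction such as ADDWrr:
//      %2:gpr32 = ADDWrr %0, %1
//      %3:gpr32 = ORRWrs $wzr, %2, 0
//      %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//    ==>
//      %2:gpr32 = ADDWrr %0, %1
//      %4:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32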
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can directly copy the values between the FPRs,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If an MI implicitly sets the high 64 bits to zero, remove the `mov 0`
//    of the high 64 bits. For example,
//
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %2:fpr64 = MOVID 0
//      %4:fpr128 = IMPLICIT_DEF
//      %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//      %6:fpr128 = IMPLICIT_DEF
//      %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//      %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %6:fpr128 = IMPLICIT_DEF
//      %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Say there is a
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates reproduces the original constant.
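  // As an illustrative worked example: for the constant above (0x00200400,
  // bits 10 and 21 set), LowestBitSet is 10 and HighestBitSet is 21, so
  // NewImm1 below becomes 0x003ffc00 (bits 10..21) and NewImm2 becomes
  // 0xffe007ff, and NewImm1 & NewImm2 == 0x00200400 again.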
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with ones from the position of the lowest
  // bit set to the position of the highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with ones outside the range between the
  // lowest and highest set bits, keeping the original bits inside that range.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. That makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If an AArch64 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the source MI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  }
  else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If an AArch64 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the source MI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
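  // For instance (illustrative only): Imm = 0x123456 splits into Imm0 = 0x123
  // and Imm1 = 0x456, which the callers emit as an ADD/SUB of 0x123 shifted
  // left by 12 followed by an ADD/SUB of 0x456.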
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // Do not split if the immediate can be materialized by a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the transformations below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. That makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code usages are only for Equal and Not Equal.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the conditional uses last, since scanning the following
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and whether the MI is loop
  // invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is a SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it would
  // cause more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign extended when we assigned it to the
  // 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1
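  // For example, as wired up in runOnMachineFunction, ADDSWrr is split using
  // the pair {ADDWri, ADDSWri} so that only the second, flag-setting
  // instruction updates NZCV.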

  // Determine the register classes for the destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination and source registers, and create the new
  // destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want
  // to reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until deleting MI, but only if we created a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check whether this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check whether it is an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We expect the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for GPR.
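  // A hypothetical instance of the pattern this handles:
  //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //   %2:fpr64 = FMOVDr %1:fpr64
  // FCVTNv4i16 already zeroes the high 64 bits, so the FMOVDr is redundant and
  // every use of %2 can simply read %1 instead.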
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MI for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example a sxtw). We can replace the COPY
// with a new version that skips the sxtw.
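// An illustrative example of the rewrite:
//   %1:gpr64 = SBFMXri %0:gpr64, 0, 31     ; sxtw
//   %2:gpr32 = COPY %1.sub_32
// ==>
//   %2:gpr32 = COPY %0.sub_32
// The low 32 bits of %0 and of the sign-extended %1 are identical, so the
// sxtw (and any intervening full copies) can be dropped when %1 has no other
// uses.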
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
    return false;

  Register SrcReg = SrcMI->getOperand(1).getReg();
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << " to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}