CoCalc -- AArch64MIPeepholeOpt.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
³⁵²⁶⁷ views
1
//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass performs below peephole optimizations on MIR level.
10
//
11
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13
//
14
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
15
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
16
//
17
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
18
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19
//
20
//    The mov pseudo instruction could be expanded to multiple mov instructions
21
//    later. In this case, we could try to split the constant operand of mov
22
//    instruction into two immediates which can be directly encoded into
23
//    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24
//    multiple `mov` + `and/add/sub` instructions.
25
//
26
// 4. Remove redundant ORRWrs which is generated by zero-extend.
27
//
28
//    %3:gpr32 = ORRWrs $wzr, %2, 0
29
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30
//
31
//    If AArch64's 32-bit form of instruction defines the source operand of
32
//    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33
//    operand are set to zero.
34
//
35
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
36
//     ==> %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
37
//
38
// 6. %intermediate:gpr32 = COPY %src:fpr128
39
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
40
//     ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
41
//
42
//    In cases where a source FPR is copied to a GPR in order to be copied
43
//    to a destination FPR, we can directly copy the values between the FPRs,
44
//    eliminating the use of the Integer unit. When we match a pattern of
45
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
46
//    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
47
//    instructions.
48
//
49
// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
50
//    64-bits. For example,
51
//
52
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
53
//   %2:fpr64 = MOVID 0
54
//   %4:fpr128 = IMPLICIT_DEF
55
//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
56
//   %6:fpr128 = IMPLICIT_DEF
57
//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
58
//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
59
//   ==>
60
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
61
//   %6:fpr128 = IMPLICIT_DEF
62
//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
63
//
64
//===----------------------------------------------------------------------===//
65

66
#include "AArch64ExpandImm.h"
67
#include "AArch64InstrInfo.h"
68
#include "MCTargetDesc/AArch64AddressingModes.h"
69
#include "llvm/CodeGen/MachineDominators.h"
70
#include "llvm/CodeGen/MachineLoopInfo.h"
71

72
using namespace llvm;
73

74
#define DEBUG_TYPE "aarch64-mi-peephole-opt"
75

76
namespace {
77

78
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
79
  static char ID;
80

81
  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
82
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
83
  }
84

85
  const AArch64InstrInfo *TII;
86
  const AArch64RegisterInfo *TRI;
87
  MachineLoopInfo *MLI;
88
  MachineRegisterInfo *MRI;
89

90
  using OpcodePair = std::pair<unsigned, unsigned>;
91
  template <typename T>
92
  using SplitAndOpcFunc =
93
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
94
  using BuildMIFunc =
95
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
96
                         Register, Register, Register)>;
97

98
  /// For instructions where an immediate operand could be split into two
99
  /// separate immediate instructions, use the splitTwoPartImm two handle the
100
  /// optimization.
101
  ///
102
  /// To implement, the following function types must be passed to
103
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
104
  /// splitting the immediate is valid and returns the associated new opcode. A
105
  /// BuildMIFunc must be implemented to build the two immediate instructions.
106
  ///
107
  /// Example Pattern (where IMM would require 2+ MOV instructions):
108
  ///     %dst = <Instr>rr %src IMM [...]
109
  /// becomes:
110
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
111
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
112
  template <typename T>
113
  bool splitTwoPartImm(MachineInstr &MI,
114
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
115

116
  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
117
                        MachineInstr *&SubregToRegMI);
118

119
  template <typename T>
120
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
121
  template <typename T>
122
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
123

124
  template <typename T>
125
  bool visitAND(unsigned Opc, MachineInstr &MI);
126
  bool visitORR(MachineInstr &MI);
127
  bool visitINSERT(MachineInstr &MI);
128
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
129
  bool visitINSvi64lane(MachineInstr &MI);
130
  bool visitFMOVDr(MachineInstr &MI);
131
  bool visitCopy(MachineInstr &MI);
132
  bool runOnMachineFunction(MachineFunction &MF) override;
133

134
  StringRef getPassName() const override {
135
    return "AArch64 MI Peephole Optimization pass";
136
  }
137

138
  void getAnalysisUsage(AnalysisUsage &AU) const override {
139
    AU.setPreservesCFG();
140
    AU.addRequired<MachineLoopInfoWrapperPass>();
141
    MachineFunctionPass::getAnalysisUsage(AU);
142
  }
143
};
144

145
char AArch64MIPeepholeOpt::ID = 0;
146

147
} // end anonymous namespace
148

149
// Pass registration for AArch64MIPeepholeOpt under the argument string
// "aarch64-mi-peephole-opt" with a human-readable description.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only / is-analysis flags of INITIALIZE_PASS -- confirm against the
// macro definition.
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
150
                "AArch64 MI Peephole Optimization", false, false)
151

152
template <typename T>
153
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
154
  T UImm = static_cast<T>(Imm);
155
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
156
    return false;
157

158
  // If this immediate can be handled by one instruction, do not split it.
159
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
160
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
161
  if (Insn.size() == 1)
162
    return false;
163

164
  // The bitmask immediate consists of consecutive ones.  Let's say there is
165
  // constant 0b00000000001000000000010000000000 which does not consist of
166
  // consecutive ones. We can split it in to two bitmask immediate like
167
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
168
  // If we do AND with these two bitmask immediate, we can see original one.
169
  unsigned LowestBitSet = llvm::countr_zero(UImm);
170
  unsigned HighestBitSet = Log2_64(UImm);
171

172
  // Create a mask which is filled with one from the position of lowest bit set
173
  // to the position of highest bit set.
174
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
175
              (static_cast<T>(1) << LowestBitSet);
176
  // Create a mask which is filled with one outside the position of lowest bit
177
  // set and the position of highest bit set.
178
  T NewImm2 = UImm | ~NewImm1;
179

180
  // If the split value is not valid bitmask immediate, do not split this
181
  // constant.
182
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
183
    return false;
184

185
  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
186
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
187
  return true;
188
}
189

190
template <typename T>
191
bool AArch64MIPeepholeOpt::visitAND(
192
    unsigned Opc, MachineInstr &MI) {
193
  // Try below transformation.
194
  //
195
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
196
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
197
  //
198
  // The mov pseudo instruction could be expanded to multiple mov instructions
199
  // later. Let's try to split the constant operand of mov instruction into two
200
  // bitmask immediates. It makes only two AND instructions intead of multiple
201
  // mov + and instructions.
202

203
  return splitTwoPartImm<T>(
204
      MI,
205
      [Opc](T Imm, unsigned RegSize, T &Imm0,
206
            T &Imm1) -> std::optional<OpcodePair> {
207
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
208
          return std::make_pair(Opc, Opc);
209
        return std::nullopt;
210
      },
211
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
212
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
213
                   Register NewDstReg) {
214
        DebugLoc DL = MI.getDebugLoc();
215
        MachineBasicBlock *MBB = MI.getParent();
216
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
217
            .addReg(SrcReg)
218
            .addImm(Imm0);
219
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
220
            .addReg(NewTmpReg)
221
            .addImm(Imm1);
222
      });
223
}
224

225
/// Remove an ORRWrs that only exists to zero-extend a 32-bit value, as
/// produced by
///
///   def : Pat<(i64 (zext GPR32:$src)),
///             (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
///
/// \returns true if \p MI was erased and its result replaced by its source.
bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Only "ORRWrs WZR, src, 0" (no shift, zero first source) qualifies.
  const bool IsZextMove = MI.getOperand(3).getImm() == 0 &&
                          MI.getOperand(1).getReg() == AArch64::WZR;
  if (!IsZextMove)
    return false;

  Register SrcReg = MI.getOperand(2).getReg();
  MachineInstr *Producer = MRI->getUniqueVRegDef(SrcReg);
  if (!Producer)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // So if a real 32-bit AArch64 instruction defines the source, the
  // zero-extend is redundant. Generic (target-independent) opcodes are
  // rejected conservatively, except for a COPY from an FPR, handled below.
  const bool IsCopyOfVReg =
      Producer->getOpcode() == TargetOpcode::COPY &&
      Producer->getOperand(1).getReg().isVirtual();

  if (!IsCopyOfVReg) {
    if (Producer->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;
  } else {
    MachineOperand &CopyFrom = Producer->getOperand(1);
    const TargetRegisterClass *RC = MRI->getRegClass(CopyFrom.getReg());
    const bool ReadsSSub = CopyFrom.getSubReg() == AArch64::ssub;
    const bool IsFPR32 = RC == &AArch64::FPR32RegClass;
    const bool IsWiderFPR =
        RC == &AArch64::FPR64RegClass || RC == &AArch64::FPR128RegClass;

    // Accept an FPR32 source, or the ssub lane of an FPR64/FPR128 source.
    if (!IsFPR32 && !(IsWiderFPR && ReadsSSub))
      return false;

    // A COPY from an FPR will become a FMOVSWr; materialize it now so the
    // definition is a real instruction whose upper bits are known zero.
    Register FMovSrc = CopyFrom.getReg();
    if (ReadsSSub) {
      // Pull the ssub lane into its own FPR32 first.
      FMovSrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*Producer->getParent(), Producer, Producer->getDebugLoc(),
              TII->get(TargetOpcode::COPY), FMovSrc)
          .add(CopyFrom);
    }
    BuildMI(*Producer->getParent(), Producer, Producer->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), Producer->getOperand(0).getReg())
        .addReg(FMovSrc);
    Producer->eraseFromParent();
  }

  // Forward the source to every user of the ORR result, then drop the ORR.
  Register DefReg = MI.getOperand(0).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  // SrcReg now has additional uses, so existing kill flags may be stale.
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}
285

286
/// Turn an INSERT_SUBREG that implements a zero-extend into SUBREG_TO_REG:
///
///   From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
///   To   %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
///
/// The first INSERT_SUBREG operand is treated as irrelevant because a COPY
/// would destroy the upper part of the register anyway.
///
/// \returns true if \p MI was replaced.
bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  const MachineOperand &Inserted = MI.getOperand(2);
  MachineInstr *Producer = MRI->getUniqueVRegDef(Inserted.getReg());
  if (!Producer)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // Hence the zero-extend is unnecessary when a real AArch64 instruction
  // defines the inserted value. Be conservative for generic opcodes.
  if (Producer->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;
  // Only destinations in (a subclass of) GPR64all are handled.
  if (!AArch64::GPR64allRegClass.hasSubClassEq(DstRC))
    return false;

  // Build the replacement SUBREG_TO_REG right before MI.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(Inserted)
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI; // Only read by LLVM_DEBUG.
  MI.eraseFromParent();

  return true;
}
330

331
template <typename T>
332
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
333
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
334
  // imm0 and imm1 are non-zero 12-bit unsigned int.
335
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
336
      (Imm & ~static_cast<T>(0xffffff)) != 0)
337
    return false;
338

339
  // The immediate can not be composed via a single instruction.
340
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
341
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
342
  if (Insn.size() == 1)
343
    return false;
344

345
  // Split Imm into (Imm0 << 12) + Imm1;
346
  Imm0 = (Imm >> 12) & 0xfff;
347
  Imm1 = Imm & 0xfff;
348
  return true;
349
}
350

351
template <typename T>
352
bool AArch64MIPeepholeOpt::visitADDSUB(
353
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
354
  // Try below transformation.
355
  //
356
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
357
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
358
  //
359
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
360
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
361
  //
362
  // The mov pseudo instruction could be expanded to multiple mov instructions
363
  // later. Let's try to split the constant operand of mov instruction into two
364
  // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
365
  // multiple `mov` + `and/sub` instructions.
366

367
  // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
368
  // folded. Make sure that we don't generate invalid instructions that use XZR
369
  // in those cases.
370
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
371
      MI.getOperand(1).getReg() == AArch64::WZR)
372
    return false;
373

374
  return splitTwoPartImm<T>(
375
      MI,
376
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
377
                       T &Imm1) -> std::optional<OpcodePair> {
378
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
379
          return std::make_pair(PosOpc, PosOpc);
380
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
381
          return std::make_pair(NegOpc, NegOpc);
382
        return std::nullopt;
383
      },
384
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
385
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
386
                   Register NewDstReg) {
387
        DebugLoc DL = MI.getDebugLoc();
388
        MachineBasicBlock *MBB = MI.getParent();
389
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
390
            .addReg(SrcReg)
391
            .addImm(Imm0)
392
            .addImm(12);
393
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
394
            .addReg(NewTmpReg)
395
            .addImm(Imm1)
396
            .addImm(0);
397
      });
398
}
399

400
template <typename T>
401
bool AArch64MIPeepholeOpt::visitADDSSUBS(
402
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
403
  // Try the same transformation as ADDSUB but with additional requirement
404
  // that the condition code usages are only for Equal and Not Equal
405

406
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
407
      MI.getOperand(1).getReg() == AArch64::WZR)
408
    return false;
409

410
  return splitTwoPartImm<T>(
411
      MI,
412
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
413
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
414
                   T &Imm1) -> std::optional<OpcodePair> {
415
        OpcodePair OP;
416
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
417
          OP = PosOpcs;
418
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
419
          OP = NegOpcs;
420
        else
421
          return std::nullopt;
422
        // Check conditional uses last since it is expensive for scanning
423
        // proceeding instructions
424
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
425
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
426
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
427
          return std::nullopt;
428
        return OP;
429
      },
430
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
431
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
432
                   Register NewDstReg) {
433
        DebugLoc DL = MI.getDebugLoc();
434
        MachineBasicBlock *MBB = MI.getParent();
435
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
436
            .addReg(SrcReg)
437
            .addImm(Imm0)
438
            .addImm(12);
439
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
440
            .addReg(NewTmpReg)
441
            .addImm(Imm1)
442
            .addImm(0);
443
      });
444
}
445

446
// Checks if the corresponding MOV immediate instruction is applicable for
447
// this peephole optimization.
448
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
449
                                            MachineInstr *&MovMI,
450
                                            MachineInstr *&SubregToRegMI) {
451
  // Check whether current MBB is in loop and the AND is loop invariant.
452
  MachineBasicBlock *MBB = MI.getParent();
453
  MachineLoop *L = MLI->getLoopFor(MBB);
454
  if (L && !L->isLoopInvariant(MI))
455
    return false;
456

457
  // Check whether current MI's operand is MOV with immediate.
458
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
459
  if (!MovMI)
460
    return false;
461

462
  // If it is SUBREG_TO_REG, check its operand.
463
  SubregToRegMI = nullptr;
464
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
465
    SubregToRegMI = MovMI;
466
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
467
    if (!MovMI)
468
      return false;
469
  }
470

471
  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
472
      MovMI->getOpcode() != AArch64::MOVi64imm)
473
    return false;
474

475
  // If the MOV has multiple uses, do not split the immediate because it causes
476
  // more instructions.
477
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
478
    return false;
479
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
480
    return false;
481

482
  // It is OK to perform this peephole optimization.
483
  return true;
484
}
485

486
template <typename T>
487
bool AArch64MIPeepholeOpt::splitTwoPartImm(
488
    MachineInstr &MI,
489
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
490
  unsigned RegSize = sizeof(T) * 8;
491
  assert((RegSize == 32 || RegSize == 64) &&
492
         "Invalid RegSize for legal immediate peephole optimization");
493

494
  // Perform several essential checks against current MI.
495
  MachineInstr *MovMI, *SubregToRegMI;
496
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
497
    return false;
498

499
  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
500
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
501
  // For the 32 bit form of instruction, the upper 32 bits of the destination
502
  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
503
  // of Imm to zero. This is essential if the Immediate value was a negative
504
  // number since it was sign extended when we assign to the 64-bit Imm.
505
  if (SubregToRegMI)
506
    Imm &= 0xFFFFFFFF;
507
  OpcodePair Opcode;
508
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
509
    Opcode = *R;
510
  else
511
    return false;
512

513
  // Create new MIs using the first and second opcodes. Opcodes might differ for
514
  // flag setting operations that should only set flags on second instruction.
515
  // NewTmpReg = Opcode.first SrcReg Imm0
516
  // NewDstReg = Opcode.second NewTmpReg Imm1
517

518
  // Determine register classes for destinations and register operands
519
  MachineFunction *MF = MI.getMF();
520
  const TargetRegisterClass *FirstInstrDstRC =
521
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
522
  const TargetRegisterClass *FirstInstrOperandRC =
523
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
524
  const TargetRegisterClass *SecondInstrDstRC =
525
      (Opcode.first == Opcode.second)
526
          ? FirstInstrDstRC
527
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
528
  const TargetRegisterClass *SecondInstrOperandRC =
529
      (Opcode.first == Opcode.second)
530
          ? FirstInstrOperandRC
531
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
532

533
  // Get old registers destinations and new register destinations
534
  Register DstReg = MI.getOperand(0).getReg();
535
  Register SrcReg = MI.getOperand(1).getReg();
536
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
537
  // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
538
  // reuse that same destination register.
539
  Register NewDstReg = DstReg.isVirtual()
540
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
541
                           : DstReg;
542

543
  // Constrain registers based on their new uses
544
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
545
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
546
  if (DstReg != NewDstReg)
547
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
548

549
  // Call the delegating operation to build the instruction
550
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
551

552
  // replaceRegWith changes MI's definition register. Keep it for SSA form until
553
  // deleting MI. Only if we made a new destination register.
554
  if (DstReg != NewDstReg) {
555
    MRI->replaceRegWith(DstReg, NewDstReg);
556
    MI.getOperand(0).setReg(DstReg);
557
  }
558

559
  // Record the MIs need to be removed.
560
  MI.eraseFromParent();
561
  if (SubregToRegMI)
562
    SubregToRegMI->eraseFromParent();
563
  MovMI->eraseFromParent();
564

565
  return true;
566
}
567

568
/// Replace an INSvi[X]gpr whose GPR operand is (a chain of COPYs of) an
/// FPR128 value by the lane-to-lane form:
///
/// From
///  %intermediate1:gpr64 = COPY %src:fpr128
///  %intermediate2:gpr32 = COPY %intermediate1:gpr64
///  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
/// To
///  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
///  src_index
/// where src_index = 0, X = [8|16|32|64]
///
/// \param MI   the INSvi[X]gpr instruction.
/// \param Opc  the matching INSvi[X]lane opcode to emit.
/// \returns true if \p MI was replaced.
bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Walk up the COPY chain starting at the inserted GPR until a COPY whose
  // source is a virtual FPR128 register is found. Anything else on the way
  // (non-COPY definition, missing definition, physical source) aborts.
  MachineInstr *CopyMI = nullptr;
  Register SrcReg = MI.getOperand(3).getReg();
  do {
    CopyMI = MRI->getUniqueVRegDef(SrcReg);
    if (!CopyMI || CopyMI->getOpcode() != TargetOpcode::COPY)
      return false;

    SrcReg = CopyMI->getOperand(1).getReg();
    if (!SrcReg.isVirtual())
      return false;
  } while (MRI->getRegClass(SrcReg) != &AArch64::FPR128RegClass);

  // Insert lane 0 of the FPR128 source directly, keeping the register state
  // flags of the COPY's source operand.
  Register DstReg = MI.getOperand(0).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(CopyMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI; // Only read by LLVM_DEBUG.
  MI.eraseFromParent();
  return true;
}
612

613
// All instructions that set a FPR64 will implicitly zero the top bits of the
614
// register.
615
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
616
                                        MachineRegisterInfo *MRI) {
617
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
618
    return false;
619
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
620
  if (RC != &AArch64::FPR64RegClass)
621
    return false;
622
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
623
}
624

625
/// Remove an INSvi64lane that writes zero into the high 64 bits of a vector
/// whose low 64 bits come from an instruction that already zeroes the high
/// half implicitly.
///
/// \returns true if \p MI was erased and its result replaced by operand 1.
bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
  // We are expecting below case.
  //
  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //  %6:fpr128 = IMPLICIT_DEF
  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  // getUniqueVRegDef may return null; guard before dereferencing, exactly as
  // the High64MI path below does.
  if (!Low64MI || Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check there is `mov 0` MI for high 64-bits.
  // We are expecting below cases.
  //
  //  %2:fpr64 = MOVID 0
  //  %4:fpr128 = IMPLICIT_DEF
  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //  %5:fpr128 = MOVIv2d_ns 0
  //  %6:fpr64 = COPY %5.dsub:fpr128
  //  %8:fpr128 = IMPLICIT_DEF
  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  // Look through one COPY (second pattern above).
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  // The moved immediate must be zero.
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // The insert is redundant: forward the low-half vector to all users of the
  // INSvi64lane result and erase the instruction.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}
674

675
/// Remove an FMOVDr whose source is already defined by an instruction that
/// zeroes the high 64 bits. FMOVDr itself sets the high 64 bits to zero
/// implicitly (similar to ORR for GPRs), so in that case it adds nothing.
///
/// \returns true if \p MI was erased and its result replaced by its source.
bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  Register NewDef = MI.getOperand(1).getReg();

  MachineInstr *Low64MI = MRI->getUniqueVRegDef(NewDef);
  if (!Low64MI)
    return false;
  if (!is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Forward the source to all users of the FMOVDr result and erase it.
  Register OldDef = MI.getOperand(0).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  // Clear kill flags on both registers before merging them.
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}
693

694
// Across a basic-block we might have in i32 extract from a value that only
695
// operates on upper bits (for example a sxtw). We can replace the COPY with a
696
// new version skipping the sxtw.
697
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
698
  Register InputReg = MI.getOperand(1).getReg();
699
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
700
      !MRI->hasOneNonDBGUse(InputReg))
701
    return false;
702

703
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
704
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
705
  DeadInstrs.insert(SrcMI);
706
  while (SrcMI && SrcMI->isFullCopy() &&
707
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
708
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
709
    DeadInstrs.insert(SrcMI);
710
  }
711

712
  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
713
      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
714
    return false;
715

716
  Register SrcReg = SrcMI->getOperand(1).getReg();
717
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
718
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
719
  MI.getOperand(1).setReg(SrcReg);
720
  LLVM_DEBUG(dbgs() << "        to: " << MI);
721
  for (auto *DeadMI : DeadInstrs) {
722
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
723
    DeadMI->eraseFromParent();
724
  }
725
  return true;
726
}
727

728
/// Pass entry point: caches the per-function helpers, then offers every
/// instruction of \p MF to the visitor matching its opcode.
///
/// \returns true if any visitor changed the function.
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const auto &Subtarget = MF.getSubtarget();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(Subtarget.getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  // Route one instruction to its visitor; opcodes without a visitor report
  // "no change".
  auto Visit = [this](MachineInstr &MI) -> bool {
    switch (MI.getOpcode()) {
    case AArch64::INSERT_SUBREG:
      return visitINSERT(MI);
    case AArch64::ANDWrr:
      return visitAND<uint32_t>(AArch64::ANDWri, MI);
    case AArch64::ANDXrr:
      return visitAND<uint64_t>(AArch64::ANDXri, MI);
    case AArch64::ORRWrs:
      return visitORR(MI);
    case AArch64::ADDWrr:
      return visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
    case AArch64::SUBWrr:
      return visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
    case AArch64::ADDXrr:
      return visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
    case AArch64::SUBXrr:
      return visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
    case AArch64::ADDSWrr:
      return visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                     {AArch64::SUBWri, AArch64::SUBSWri}, MI);
    case AArch64::SUBSWrr:
      return visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                     {AArch64::ADDWri, AArch64::ADDSWri}, MI);
    case AArch64::ADDSXrr:
      return visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                     {AArch64::SUBXri, AArch64::SUBSXri}, MI);
    case AArch64::SUBSXrr:
      return visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                     {AArch64::ADDXri, AArch64::ADDSXri}, MI);
    case AArch64::INSvi64gpr:
      return visitINSviGPR(MI, AArch64::INSvi64lane);
    case AArch64::INSvi32gpr:
      return visitINSviGPR(MI, AArch64::INSvi32lane);
    case AArch64::INSvi16gpr:
      return visitINSviGPR(MI, AArch64::INSvi16lane);
    case AArch64::INSvi8gpr:
      return visitINSviGPR(MI, AArch64::INSvi8lane);
    case AArch64::INSvi64lane:
      return visitINSvi64lane(MI);
    case AArch64::FMOVDr:
      return visitFMOVDr(MI);
    case AArch64::COPY:
      return visitCopy(MI);
    default:
      return false;
    }
  };

  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    // Visitors may erase the instruction they are handed, hence the
    // early-increment range.
    for (MachineInstr &MI : make_early_inc_range(MBB))
      Changed |= Visit(MI);
  }

  return Changed;
}
818

819
/// Factory for the pass: returns a newly allocated AArch64MIPeepholeOpt.
FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  FunctionPass *Pass = new AArch64MIPeepholeOpt();
  return Pass;
}
822

823
Product

Resources

Company